# gty-sermon-scraper/indexer.py
# Retrieved 2025-07-19 20:38:39 -04:00 — 90 lines, 2.0 KiB, Python
import json
import multiprocessing
from pathlib import Path
import httpx
from loguru import logger
# GTY "Library" API endpoints (Canadian mirror). info_url lists the available
# section codes; api_url is a template filled in with a section {code} and
# 1-based {page} number when paging through that section's resources.
info_url = 'https://ca.gty.org/api/Library/GetInitialSectionInformation/2/US'
api_url = "https://ca.gty.org/api/Library/GetResources/2/code/{code}/none/none/en/{page}/US"
def get_codes(url):
    """GET *url* and return the list of section code values it advertises.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    response = httpx.get(url)
    response.raise_for_status()
    payload = response.json()
    return [entry['value'] for entry in payload['possibleCodes']]
def get_sermon_data(code):
    """Download the full sermon listing for *code* and cache it as JSON.

    Fetches page 1 to learn the total record count, returns early when the
    cached file already holds that many sermons, otherwise pages through the
    API until every record is collected and writes the result to
    ./api_data/{code}.json.

    Returns the list of sermon dicts, or None when the cache is current.
    """
    print(f'{code=}')
    # Start with page 1 — its response also carries the total record count.
    page = 1
    data = get_api_data(code, page)
    total_sermons = data['totalNumberOfRecords']
    if new_sermon_check(code, total_sermons) is False:
        print('\tNo new sermons')
        return
    all_sermons = list(data['items'])
    print(f'\t{total_sermons=}')
    print(f'\t{len(all_sermons)=}')
    # Continue with the next page until we have every advertised record.
    while len(all_sermons) < total_sermons:
        page += 1
        print(f'\t{page=}')
        data = get_api_data(code, page)
        sermons = list(data['items'])
        if len(sermons) == 0:
            # API advertised more records than it serves; stop rather
            # than loop forever on empty pages.
            print('NO MORE SERMONS')
            break
        all_sermons += sermons
        print(f'\t{len(all_sermons)=}')
    # FIX: the original passed an anonymous open file handle straight to
    # json.dump, leaking it (and risking an unflushed write). Ensure the
    # output directory exists and close the file deterministically.
    out_path = Path(f'./api_data/{code}.json')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('w') as fh:
        json.dump(all_sermons, fh, indent=4)
    return all_sermons
def get_api_data(code, page):
    """Fetch one page of the resource listing for *code* and return the decoded JSON.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    client = httpx.Client()
    try:
        resp = client.get(api_url.format(code=code, page=page))
        resp.raise_for_status()
        return resp.json()
    finally:
        client.close()
def new_sermon_check(code, total_sermons):
    """Return True when *code* needs (re)fetching.

    True when no cache file exists yet, or when the cached listing's
    length differs from the API's advertised *total_sermons*.
    """
    cache_file = Path(f'./api_data/{code}.json')
    if not cache_file.exists():
        print('\tNO FILE')
        return True
    cached = json.loads(cache_file.read_text())
    return len(cached) != total_sermons
def main():
    """Index every advertised section code, one after another."""
    # NOTE: a parallel variant using
    # multiprocessing.Pool(multiprocessing.cpu_count()).map(get_sermon_data, codes)
    # was previously sketched here as an alternative.
    codes = get_codes(info_url)
    for code in codes:
        get_sermon_data(code)


if __name__ == "__main__":
    main()