# gty-sermon-scraper/indexer.py
# Retrieved 2025-07-19 20:38:39 -04:00 — 90 lines, 2.0 KiB, Python
import json
import multiprocessing
from pathlib import Path
import httpx
from loguru import logger
# GTY "Library" API endpoints (Canadian mirror). info_url lists the available
# section codes; api_url is a template filled in with a section {code} and
# 1-based {page} number when paging through that section's resources.
info_url = 'https://ca.gty.org/api/Library/GetInitialSectionInformation/2/US'
api_url = "https://ca.gty.org/api/Library/GetResources/2/code/{code}/none/none/en/{page}/US"
def get_codes(url):
    """GET *url* and return the list of section code values it advertises.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    response = httpx.get(url)
    response.raise_for_status()
    payload = response.json()
    return [entry['value'] for entry in payload['possibleCodes']]
def get_sermon_data(code):
    """Download the full sermon listing for *code* and cache it as JSON.

    Fetches page 1 to learn the total record count, returns early when the
    cached file already holds that many sermons, otherwise pages through the
    API until every record is collected and writes the result to
    ./api_data/{code}.json.

    Returns the list of sermon dicts, or None when the cache is current.
    """
    print(f'{code=}')
    # Start with page 1 — its response also carries the total record count.
    page = 1
    data = get_api_data(code, page)
    total_sermons = data['totalNumberOfRecords']
    if new_sermon_check(code, total_sermons) is False:
        print('\tNo new sermons')
        return
    all_sermons = list(data['items'])
    print(f'\t{total_sermons=}')
    print(f'\t{len(all_sermons)=}')
    # Continue with the next page until we have every advertised record.
    while len(all_sermons) < total_sermons:
        page += 1
        print(f'\t{page=}')
        data = get_api_data(code, page)
        sermons = list(data['items'])
        if len(sermons) == 0:
            # API advertised more records than it serves; stop rather
            # than loop forever on empty pages.
            print('NO MORE SERMONS')
            break
        all_sermons += sermons
        print(f'\t{len(all_sermons)=}')
    # FIX: the original passed an anonymous open file handle straight to
    # json.dump, leaking it (and risking an unflushed write). Ensure the
    # output directory exists and close the file deterministically.
    out_path = Path(f'./api_data/{code}.json')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open('w') as fh:
        json.dump(all_sermons, fh, indent=4)
    return all_sermons
def get_api_data(code, page):
    """Fetch one page of the resource listing for *code* and return the decoded JSON.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    client = httpx.Client()
    try:
        resp = client.get(api_url.format(code=code, page=page))
        resp.raise_for_status()
        return resp.json()
    finally:
        client.close()
def new_sermon_check(code, total_sermons):
    """Return True when *code* needs (re)fetching.

    True when no cache file exists yet, or when the cached listing's
    length differs from the API's advertised *total_sermons*.
    """
    cache_file = Path(f'./api_data/{code}.json')
    if not cache_file.exists():
        print('\tNO FILE')
        return True
    cached = json.loads(cache_file.read_text())
    return len(cached) != total_sermons
def main():
    """Index every advertised section code, one after another."""
    # NOTE: a parallel variant using
    # multiprocessing.Pool(multiprocessing.cpu_count()).map(get_sermon_data, codes)
    # was previously sketched here as an alternative.
    codes = get_codes(info_url)
    for code in codes:
        get_sermon_data(code)


if __name__ == "__main__":
    main()