# gty-sermon-scraper/downloader.py

import json
import multiprocessing

import httpx

from pathlib import Path

# URL templates; ``{code}`` is filled in from each sermon's ``code`` field.
sermon_transcript_url = 'https://www.gty.org/library/print/sermons-library/{code}'
audio_download_url = "https://cdn.gty.org/sermons/High/{code}.mp3"

# Local destination directories for the downloaded files.
local_audio = Path('/run/media/bear/data/audio/')
# NOTE(review): 'transcrips' looks like a typo for 'transcripts'; the name is
# kept as-is because files already saved there are looked up by this path.
local_transcript = Path('/run/media/bear/data/transcrips')

def download_transcript(sermon):
    """Download the printable transcript page for one sermon.

    The page is saved as ``<code>.html`` inside ``local_transcript``. If that
    file already exists the download is skipped.

    Args:
        sermon: Mapping describing the sermon. Its ``'code'`` entry names the
            output file, and the whole mapping is passed to
            ``sermon_transcript_url.format`` to build the request URL.

    Returns:
        None. Progress is reported with ``print``.
    """
    transcript = local_transcript / f'{sermon["code"]}.html'
    transcript.parent.mkdir(exist_ok=True, parents=True)

    if transcript.exists():
        print(f'Skipping {transcript.name}')
        return

    response = httpx.get(sermon_transcript_url.format(**sermon))

    # Never persist an error page: the exists() check above would treat it as
    # a finished download and skip this sermon on every later run.
    try:
        response.raise_for_status()
    except httpx.HTTPStatusError as exc:
        print(f'Failed {transcript.name}: {exc}')
        return

    # Explicit encoding so the result does not depend on the process locale.
    transcript.write_text(response.text, encoding='utf-8')
    print(f'Downloaded {transcript.name}')


def download_audio(sermon):
    """Download the MP3 for one sermon.

    The audio is saved as ``<code>.mp3`` inside ``local_audio``. If that file
    already exists the download is skipped.

    Args:
        sermon: Mapping describing the sermon. Its ``'code'`` entry names the
            output file, and the whole mapping is passed to
            ``audio_download_url.format`` to build the request URL.

    Returns:
        None. Progress is reported with ``print``.
    """
    audio = local_audio / f'{sermon["code"]}.mp3'
    audio.parent.mkdir(exist_ok=True, parents=True)

    if audio.exists():
        print(f'Skipping {audio.name}')
        return

    # Write to a sibling ``.part`` file and rename only after the body has
    # been fully received. An interrupted run then never leaves a truncated
    # ``.mp3`` behind that the exists() check above would skip forever; a
    # leftover ``.part`` file is simply overwritten on the next attempt.
    partial = audio.with_name(f'{audio.name}.part')

    # Stream the body so the whole file is not held in memory at once.
    with httpx.stream('GET', audio_download_url.format(**sermon), timeout=60) as response:
        try:
            response.raise_for_status()
        except httpx.HTTPStatusError as exc:
            # Do not save an error body under the ``.mp3`` name.
            print(f'Failed {audio.name}: {exc}')
            return

        with open(partial, 'wb') as f:
            for chunk in response.iter_bytes():
                f.write(chunk)

    partial.replace(audio)
    print(f'Downloaded {audio.name}')


def main():
    """Download the audio for every sermon listed under ``./api_data``.

    Every entry matched by ``./api_data/*`` is read and parsed as JSON; the
    parsed value is mapped over ``download_audio`` in a process pool, so it
    is expected to be an iterable of sermon mappings.
    """
    # Create the worker pool once and reuse it for every index file instead
    # of spawning and tearing down a fresh set of processes per file.
    with multiprocessing.Pool() as pool:
        for index in Path('./api_data').glob('*'):
            sermons = json.loads(index.read_text(encoding='utf-8'))

            pool.map(download_audio, sermons)

            # Transcripts are currently not fetched; to enable them, also run
            # pool.map(download_transcript, sermons) here.


# Run the downloader only when executed as a script, not when the module is
# imported (multiprocessing worker processes may import this module).
if __name__ == '__main__':
    main()