"""Download sermon audio (and optionally transcripts) listed in local index files.

Every file found directly under ``./api_data`` is parsed as a JSON list of
sermon records.  Each record needs at least a ``code`` entry, which is used to
build both the download URL and the local file name.
"""
import contextlib
import json
import multiprocessing
import os
from pathlib import Path

import httpx

sermon_transcript_url = 'https://www.gty.org/library/print/sermons-library/{code}'
audio_download_url = "https://cdn.gty.org/sermons/High/{code}.mp3"
local_audio = Path('/run/media/bear/data/audio/')
# NOTE: the directory name is spelled "transcrips"; it is left unchanged so the
# skip check below keeps looking in the same place as before.
local_transcript = Path('/run/media/bear/data/transcrips')


@contextlib.contextmanager
def _staged(target):
    """Yield a temporary sibling of *target* that replaces it on success.

    The caller writes to the yielded path.  If the ``with`` body finishes
    without raising, the temporary file is renamed onto *target*; otherwise it
    is deleted.  This keeps a half-written download from ever appearing under
    the final name, where the "already exists" check would skip it forever.
    """
    # The process id keeps pool workers from sharing a staging file when the
    # same sermon code shows up more than once.
    staging = target.with_name(f'{target.name}.{os.getpid()}.part')
    try:
        yield staging
        staging.replace(target)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        staging.unlink(missing_ok=True)


def download_transcript(sermon):
    """Fetch the printable transcript page for *sermon* and store it as HTML.

    Args:
        sermon: Mapping with at least a ``code`` key.  All of its items are
            passed to ``str.format`` to fill in the URL template.

    Returns:
        None.  Prints one line saying whether the file was skipped,
        downloaded, or failed.
    """
    transcript = local_transcript / f'{sermon["code"]}.html'
    transcript.parent.mkdir(exist_ok=True, parents=True)
    if transcript.exists():
        print(f'Skipping {transcript.name}')
        return

    try:
        response = httpx.get(sermon_transcript_url.format(**sermon))
        # Without this, an error page would be saved as if it were the transcript.
        response.raise_for_status()
    except httpx.HTTPError as exc:
        # Report and move on so one bad sermon does not abort the whole batch.
        print(f'Failed {transcript.name}: {exc}')
        return

    with _staged(transcript) as staging:
        staging.write_text(response.text, encoding='utf-8')
    print(f'Downloaded {transcript.name}')


def download_audio(sermon):
    """Fetch the MP3 for *sermon* and store it under ``local_audio``.

    The response body is streamed to disk in chunks rather than being loaded
    into memory in one piece.

    Args:
        sermon: Mapping with at least a ``code`` key.  All of its items are
            passed to ``str.format`` to fill in the URL template.

    Returns:
        None.  Prints one line saying whether the file was skipped,
        downloaded, or failed.
    """
    audio = local_audio / f'{sermon["code"]}.mp3'
    audio.parent.mkdir(exist_ok=True, parents=True)
    if audio.exists():
        print(f'Skipping {audio.name}')
        return

    url = audio_download_url.format(**sermon)
    try:
        with _staged(audio) as staging, httpx.stream('GET', url, timeout=60) as response:
            # Without this, an error page would be saved as if it were audio.
            response.raise_for_status()
            with open(staging, 'wb') as f:
                for chunk in response.iter_bytes():
                    f.write(chunk)
    except httpx.HTTPError as exc:
        # Report and move on so one bad sermon does not abort the whole batch.
        print(f'Failed {audio.name}: {exc}')
        return
    print(f'Downloaded {audio.name}')


def main():
    """Download the audio for every sermon listed in ``./api_data``."""
    # Sorted for a repeatable processing order; sub-directories are ignored.
    index_files = sorted(
        entry for entry in Path('./api_data').glob('*') if entry.is_file()
    )
    # A single pool is shared by all index files instead of one pool per file.
    with multiprocessing.Pool() as pool:
        for index in index_files:
            sermons = json.loads(index.read_text(encoding='utf-8'))
            pool.map(download_audio, sermons)
            # Transcript downloads are currently switched off.  To enable
            # them, also run: pool.map(download_transcript, sermons)


if __name__ == '__main__':
    main()