This commit is contained in:
2025-07-19 20:38:39 -04:00
commit 9e20e634e2
55 changed files with 79094 additions and 0 deletions

61
downloader.py Normal file
View File

@@ -0,0 +1,61 @@
import json
import multiprocessing
import httpx
from pathlib import Path
# URL templates; the ``{code}`` placeholder is filled from a sermon record
# via ``str.format(**sermon)``.
sermon_transcript_url = 'https://www.gty.org/library/print/sermons-library/{code}'
audio_download_url = 'https://cdn.gty.org/sermons/High/{code}.mp3'

# Local directories that downloaded files are written into.
local_audio = Path('/run/media/bear/data/audio/')
# NOTE(review): the directory name is spelled "transcrips"; it is kept as-is
# because it is a runtime path -- confirm whether the spelling is intended.
local_transcript = Path('/run/media/bear/data/transcrips')
def download_transcript(sermon):
    """Download the printable transcript page for one sermon.

    The page is saved as ``<code>.html`` inside ``local_transcript``. If that
    file already exists the download is skipped.

    Args:
        sermon: Mapping with string keys that contains at least ``'code'``;
            it is expanded into ``sermon_transcript_url`` with ``str.format``.

    Returns:
        None. Progress ("Skipping", "Downloaded" or "Failed") is printed.
    """
    # Double-quoted f-string so the nested ['code'] subscript also parses on
    # interpreters older than Python 3.12.
    transcript = local_transcript / f"{sermon['code']}.html"
    transcript.parent.mkdir(exist_ok=True, parents=True)
    if transcript.exists():
        print(f'Skipping {transcript.name}')
        return
    try:
        response = httpx.get(sermon_transcript_url.format(**sermon))
        # Without this check an error page (404, 500, ...) would be saved as
        # the transcript and then skipped on every later run.
        response.raise_for_status()
    except httpx.HTTPError as error:
        # Report and move on so one bad sermon does not abort a bulk run.
        print(f'Failed {transcript.name}: {error}')
        return
    # Explicit encoding keeps the saved file independent of the locale.
    transcript.write_text(response.text, encoding='utf-8')
    print(f'Downloaded {transcript.name}')
def download_audio(sermon):
    """Download the MP3 for one sermon.

    The audio is saved as ``<code>.mp3`` inside ``local_audio``. If that file
    already exists the download is skipped.

    Args:
        sermon: Mapping with string keys that contains at least ``'code'``;
            it is expanded into ``audio_download_url`` with ``str.format``.

    Returns:
        None. Progress ("Skipping", "Downloaded" or "Failed") is printed.
    """
    # Double-quoted f-string so the nested ['code'] subscript also parses on
    # interpreters older than Python 3.12.
    audio = local_audio / f"{sermon['code']}.mp3"
    audio.parent.mkdir(exist_ok=True, parents=True)
    if audio.exists():
        print(f'Skipping {audio.name}')
        return
    # Write to a sibling ".part" file first: an interrupted or failed download
    # must not leave a truncated .mp3 that later runs would skip as complete.
    partial = audio.with_name(audio.name + '.part')
    try:
        # Stream the body to disk instead of holding the whole MP3 in memory.
        with httpx.stream('GET', audio_download_url.format(**sermon), timeout=60) as response:
            # Without this check an error body would be saved as audio.
            response.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in response.iter_bytes():
                    f.write(chunk)
    except httpx.HTTPError as error:
        partial.unlink(missing_ok=True)
        # Report and move on so one bad sermon does not abort a bulk run.
        print(f'Failed {audio.name}: {error}')
        return
    partial.replace(audio)
    print(f'Downloaded {audio.name}')
def main():
    """Download the audio for every sermon listed under ``./api_data``.

    Every entry matched by ``./api_data/*`` is read as JSON and its items are
    passed to ``download_audio`` through a process pool. Each file is
    presumably a JSON array of sermon records -- verify against the data.
    """
    # One pool is shared by all index files rather than starting and tearing
    # down a fresh set of worker processes for each file.
    with multiprocessing.Pool() as p:
        for index in Path('./api_data').glob('*'):
            sermons = json.loads(index.read_text())
            p.map(download_audio, sermons)
            # Transcripts are not fetched here; to fetch them as well, map
            # download_transcript over the same ``sermons`` list.
# Run the downloader only when this file is executed as a script, so that
# importing the module does not start any downloads.
if __name__ == '__main__':
    main()