Commit 79cbbac6 authored by Bryan Tarpley
Browse files

Enabling the downloading of all ARC artifacts at once.

parent c6150cff
Showing with 104 additions and 41 deletions
+104 -41
......@@ -2,12 +2,56 @@ import os
import sys
import json
import requests
import traceback
from math import ceil, floor
# ------------------------------------ #
# utilities #
# ------------------------------------ #
# this function does all the heavy lifting. it takes a dictionary of information representing an ARC archive,
# the number of artifacts from that archive to download, the path to the download directory, whether to get
# only the full text for the archive, the corpora host, and the corpus ID for the ARC corpus.
def download_archive_data(archive, num_artifacts, download_directory, get_full_text, corpora_host, corpus_id):
    """Download one ARC archive's artifacts in batches, writing one file per batch.

    Files are written to '<download_directory>/<archive handle>/' and are named
    after the 1-based batch number: '<n>.json' in full-text mode, '<n>.ttl'
    otherwise.

    Args:
        archive: Dictionary describing the archive. Its 'handle' names the
            output sub-folder and its 'id' selects the archive on the server.
        num_artifacts: Number of artifacts to download from this archive.
        download_directory: Parent directory for the archive's sub-folder.
        get_full_text: When true, request the 'external_uri' and
            'full_text_contents' fields of the archive's ArcArtifact records
            and save them as JSON; otherwise save the response text of the
            archive's LINCS endpoint.
        corpora_host: Base URL of the Corpora host.
        corpus_id: ID of the ARC corpus.

    Raises:
        requests.HTTPError: If the server answers a batch request with an
            error status, so an error page is never saved as a data file.

    Relies on the module-level names batch_size, desired_data and
    show_progress, which are defined elsewhere in this script.
    """
    print(
        f"\nDownloading {desired_data} to the folder '{download_directory}/{archive['handle']}.' Depending on the number of artifacts, this may take some time.\n")
    download_directory = f"{download_directory}/{archive['handle']}"
    os.makedirs(download_directory, exist_ok=True)

    batches_needed = ceil(num_artifacts / batch_size)
    show_progress(0, batches_needed)

    for batch_index in range(batches_needed):
        batch_number = batch_index + 1
        skip = batch_index * batch_size
        # The final batch may be partial so that no more than num_artifacts are saved.
        limit = min(batch_size, num_artifacts - skip)

        if get_full_text:
            ft_req = requests.get(
                f"{corpora_host}/api/corpus/{corpus_id}/ArcArtifact/?f_archive.id={archive['id']}&only=external_uri,full_text_contents&page={batch_number}&page-size={batch_size}")
            ft_req.raise_for_status()
            ft = ft_req.json()
            # A response without 'records' produces no file for this batch.
            if 'records' in ft:
                records = ft['records']
                # The page size must stay constant for the page offsets to line
                # up, so the last page is trimmed here instead of in the query.
                if isinstance(records, list):
                    records = records[:limit]
                with open(f"{download_directory}/{batch_number}.json", 'w', encoding='utf-8') as ft_out:
                    json.dump(records, ft_out, indent=4)
        else:
            ttl_req = requests.get(
                f"{corpora_host}/corpus/{corpus_id}/ArcArchive/{archive['id']}/LINCS/?skip={skip}&limit={limit}")
            ttl_req.raise_for_status()
            with open(f"{download_directory}/{batch_number}.ttl", 'w', encoding='utf-8') as ttl_out:
                ttl_out.write(ttl_req.text)

        show_progress(batch_number, batches_needed)
# takes a list of dictionaries, each with a 'label' key,
# and prints out a multi-columned menu of options to select from.
# it returns the selected dictionary.
......@@ -44,13 +88,16 @@ def get_selection(options, prompt):
print(" "*indent + (" "*pad).join(f"[{option['num']}] {option['label']}".ljust(col_width) for option in row))
choice_found = False
while not choice_found:
choice = input(f"\n{prompt} ")
for option in options:
if choice == str(option['num']):
choice_found = True
return option
if choice == 'ALL':
choice_found = True
return options
else:
for option in options:
if choice == str(option['num']):
choice_found = True
return option
print('Invalid selection! Try again...')
......@@ -139,6 +186,7 @@ archive_options = []
req = requests.get(aggregation_url)
archs_meta = req.json()
if 'meta' in archs_meta and 'aggregations' in archs_meta['meta'] and 'archs' in archs_meta['meta']['aggregations']:
total_artifacts = archs_meta['meta']['total']
archs_meta = archs_meta['meta']['aggregations']['archs']
req = requests.get(f'{corpora_host}/api/corpus/{corpus_id}/ArcArchive/?page-size=1000&s_name=asc')
......@@ -155,7 +203,15 @@ if 'meta' in archs_meta and 'aggregations' in archs_meta['meta'] and 'archs' in
'count': archs_meta[arch['handle']]
})
archive = get_selection(archive_options, "Choose an ARC archive:")
archive = get_selection(archive_options, "Choose an ARC archive, or enter ALL:")
total_possible_artifacts = 0
num_to_skip = 0
if type(archive) == list:
total_possible_artifacts = total_artifacts
num_to_skip = -1
else:
total_possible_artifacts = archive['count']
archive = [archive]
num_artifacts = 0
while num_artifacts == 0:
......@@ -163,49 +219,56 @@ if 'meta' in archs_meta and 'aggregations' in archs_meta['meta'] and 'archs' in
if get_full_text:
desired_data = "full text"
choice = input(f"For how many ARC artifacts do you want {desired_data} [{archive['count']}]? ")
choice = input(f"For how many ARC artifacts do you want {desired_data} [{total_possible_artifacts}]? ")
if choice == '':
num_artifacts = archive['count']
num_artifacts = total_possible_artifacts
elif choice.isdigit():
num_artifacts = int(choice)
if num_artifacts > archive['count']:
num_artifacts = archive['count']
if num_artifacts > total_possible_artifacts:
num_artifacts = total_possible_artifacts
else:
print('Not a valid number of artifacts. Try again...')
print(f"\nDownloading {desired_data} to the folder '{archive['handle']}.' Depending on the number of artifacts, this may take some time.\n")
download_directory = f"{download_directory}/{archive['handle']}"
os.makedirs(download_directory, exist_ok=True)
batches_downloaded = 0
batches_needed = ceil(num_artifacts / batch_size)
show_progress(batches_downloaded, batches_needed)
while batches_downloaded < batches_needed:
skip = batches_downloaded * batch_size
limit = batch_size
if skip + limit > num_artifacts:
limit = num_artifacts - skip
if get_full_text:
ft_req = requests.get(f"{corpora_host}/api/corpus/{corpus_id}/ArcArtifact/?f_archive.id={archive['id']}&only=external_uri,full_text_contents&page={batches_downloaded + 1}&page-size={batch_size}")
ft = ft_req.json()
if 'records' in ft:
with open(f"{download_directory}/{batches_downloaded + 1}.json", 'w', encoding='utf-8') as ft_out:
json.dump(ft['records'], ft_out, indent=4)
while num_to_skip == -1:
choice = input(f"How many artifacts would you like to skip (useful for resuming a long-running job) [0]? ")
if choice == '':
num_to_skip = 0
elif choice.isdigit():
num_to_skip = int(choice)
else:
ttl_req = requests.get(f"{corpora_host}/corpus/{corpus_id}/ArcArchive/{archive['id']}/LINCS/?skip={skip}&limit={limit}")
ttl = ttl_req.text
with open(f"{download_directory}/{batches_downloaded + 1}.ttl", 'w', encoding='utf-8') as ttl_out:
ttl_out.write(ttl)
print('Not a valid number of artifacts. Try again...')
batches_downloaded += 1
show_progress(batches_downloaded, batches_needed)
# Walk the selected archives in order, downloading at most num_artifacts in
# total. When resuming, leading archives that fall entirely within the first
# num_to_skip artifacts are skipped.
# Uses names set earlier in this script: archive (a list of archive
# dictionaries), num_artifacts, num_to_skip, download_directory,
# get_full_text, corpora_host and corpus_id.
artifacts_remaining = num_artifacts
num_skipped = 0
# Skipping only ever applies to a leading run of archives. Once one archive is
# downloaded, no later (smaller) archive may be skipped.
still_skipping = num_to_skip > 0

for arch in archive:
    if artifacts_remaining <= 0:
        break

    # Never take more from an archive than is still wanted overall.
    arch_limit = min(arch['count'], artifacts_remaining)

    # '<=' so that entering exactly the count suggested by the resume hint
    # below does not re-download the last archive that already finished.
    if still_skipping and num_skipped + arch_limit <= num_to_skip:
        num_skipped += arch_limit
    else:
        still_skipping = False
        try:
            download_archive_data(
                arch,
                arch_limit,
                download_directory,
                get_full_text,
                corpora_host,
                corpus_id
            )
        except Exception:
            # Report the failure and carry on with the next archive. Catching
            # Exception (not a bare except) lets Ctrl-C stop the whole job.
            print("Oh, no! An error occurred:")
            print(traceback.format_exc())
            print(f"\n\nWhen resuming this job, you might skip the first {num_artifacts - artifacts_remaining} artifacts.")

    artifacts_remaining -= arch_limit
    print(f"\n{num_artifacts - artifacts_remaining} total artifacts downloaded so far.")

print("\nDownload completed!")
sys.exit()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment