Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Bryan Tarpley
arc_data_downloader
Commits
79cbbac6
Commit
79cbbac6
authored
1 year ago
by
Bryan Tarpley
Browse files
Options
Download
Email Patches
Plain Diff
Enabling the downloading of all ARC artifacts at once.
parent
c6150cff
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
download.py
+104
-41
download.py
with
104 additions
and
41 deletions
+104
-41
download.py
+
104
-
41
View file @
79cbbac6
...
...
@@ -2,12 +2,56 @@ import os
import
sys
import
json
import
requests
import
traceback
from
math
import
ceil
,
floor
# ------------------------------------ #
# utilities #
# ------------------------------------ #
# this function does all the heavy lifting. it takes a dictionary of information representing an ARC archive,
# the number of artifacts from that archive to download, the path to the download directory, whether to get
# only the full text for the archive, the corpora host, and the corpus ID for the ARC corpus.
def download_archive_data(archive, num_artifacts, download_directory, get_full_text, corpora_host, corpus_id):
    """Download one ARC archive's artifacts in batches, one output file per batch.

    Files are written to ``<download_directory>/<archive['handle']>/`` and are
    named ``1.json``, ``2.json``, ... (full text) or ``1.ttl``, ``2.ttl``, ...
    (LINCS turtle export), numbered by batch.

    Args:
        archive: dict describing the archive; the 'handle' and 'id' keys are read.
        num_artifacts: how many artifacts of this archive to download.
        download_directory: parent directory; a sub-folder named after the
            archive handle is created inside it if it does not exist.
        get_full_text: when truthy, fetch full-text JSON records; otherwise
            fetch the LINCS export as text.
        corpora_host: base URL of the corpora server.
        corpus_id: ID of the ARC corpus on that server.

    Raises:
        requests.HTTPError: if the server answers a batch request with an
            error status (nothing is written for that batch).

    NOTE(review): relies on the module-level names `desired_data`,
    `batch_size` and `show_progress`, which are defined outside this block --
    confirm they are set before this function is called.
    """
    print(f"\nDownloading {desired_data} to the folder '{download_directory}/{archive['handle']}.' Depending on the number of artifacts, this may take some time.\n")

    download_directory = f"{download_directory}/{archive['handle']}"
    os.makedirs(download_directory, exist_ok=True)

    batches_downloaded = 0
    batches_needed = ceil(num_artifacts / batch_size)
    show_progress(batches_downloaded, batches_needed)

    while batches_downloaded < batches_needed:
        skip = batches_downloaded * batch_size
        # the final batch may be smaller than a full batch
        limit = min(batch_size, num_artifacts - skip)
        batch_number = batches_downloaded + 1

        if get_full_text:
            # the page size must stay at batch_size so that page numbers line
            # up with batch numbers; the last page is trimmed after download.
            ft_req = requests.get(f"{corpora_host}/api/corpus/{corpus_id}/ArcArtifact/?f_archive.id={archive['id']}&only=external_uri,full_text_contents&page={batch_number}&page-size={batch_size}")
            # fail loudly instead of silently skipping a batch on a server error
            ft_req.raise_for_status()
            ft = ft_req.json()

            # a response without 'records' is skipped (original best-effort behavior)
            if 'records' in ft:
                # presumably 'records' is a list of artifact dicts -- TODO confirm.
                # slicing keeps the last batch from exceeding num_artifacts.
                records = ft['records'][:limit]
                with open(f"{download_directory}/{batch_number}.json", 'w', encoding='utf-8') as ft_out:
                    json.dump(records, ft_out, indent=4)
        else:
            ttl_req = requests.get(f"{corpora_host}/corpus/{corpus_id}/ArcArchive/{archive['id']}/LINCS/?skip={skip}&limit={limit}")
            # without this check an HTML error page would be saved as a .ttl file
            ttl_req.raise_for_status()
            ttl = ttl_req.text

            with open(f"{download_directory}/{batch_number}.ttl", 'w', encoding='utf-8') as ttl_out:
                ttl_out.write(ttl)

        batches_downloaded += 1
        show_progress(batches_downloaded, batches_needed)
# takes a list of dictionaries, each with a 'label' key,
# and prints out a multi-columned menu of options to select from.
# it returns the selected dictionary.
...
...
@@ -44,13 +88,16 @@ def get_selection(options, prompt):
print
(
" "
*
indent
+
(
" "
*
pad
).
join
(
f
"[
{
option
[
'num'
]
}
]
{
option
[
'label'
]
}
"
.
ljust
(
col_width
)
for
option
in
row
))
choice_found
=
False
while
not
choice_found
:
choice
=
input
(
f
"
\n
{
prompt
}
"
)
for
option
in
options
:
if
choice
==
str
(
option
[
'num'
]):
choice_found
=
True
return
option
if
choice
==
'ALL'
:
choice_found
=
True
return
options
else
:
for
option
in
options
:
if
choice
==
str
(
option
[
'num'
]):
choice_found
=
True
return
option
print
(
'Invalid selection! Try again...'
)
...
...
@@ -139,6 +186,7 @@ archive_options = []
req
=
requests
.
get
(
aggregation_url
)
archs_meta
=
req
.
json
()
if
'meta'
in
archs_meta
and
'aggregations'
in
archs_meta
[
'meta'
]
and
'archs'
in
archs_meta
[
'meta'
][
'aggregations'
]:
total_artifacts
=
archs_meta
[
'meta'
][
'total'
]
archs_meta
=
archs_meta
[
'meta'
][
'aggregations'
][
'archs'
]
req
=
requests
.
get
(
f
'
{
corpora_host
}
/api/corpus/
{
corpus_id
}
/ArcArchive/?page-size=1000&s_name=asc'
)
...
...
@@ -155,7 +203,15 @@ if 'meta' in archs_meta and 'aggregations' in archs_meta['meta'] and 'archs' in
'count'
:
archs_meta
[
arch
[
'handle'
]]
})
archive
=
get_selection
(
archive_options
,
"Choose an ARC archive:"
)
archive
=
get_selection
(
archive_options
,
"Choose an ARC archive, or enter ALL:"
)
total_possible_artifacts
=
0
num_to_skip
=
0
if
type
(
archive
)
==
list
:
total_possible_artifacts
=
total_artifacts
num_to_skip
=
-
1
else
:
total_possible_artifacts
=
archive
[
'count'
]
archive
=
[
archive
]
num_artifacts
=
0
while
num_artifacts
==
0
:
...
...
@@ -163,49 +219,56 @@ if 'meta' in archs_meta and 'aggregations' in archs_meta['meta'] and 'archs' in
if
get_full_text
:
desired_data
=
"full text"
choice
=
input
(
f
"For how many ARC artifacts do you want
{
desired_data
}
[
{
archive
[
'count'
]
}
]? "
)
choice
=
input
(
f
"For how many ARC artifacts do you want
{
desired_data
}
[
{
total_possible_artifacts
}
]? "
)
if
choice
==
''
:
num_artifacts
=
archive
[
'count'
]
num_artifacts
=
total_possible_artifacts
elif
choice
.
isdigit
():
num_artifacts
=
int
(
choice
)
if
num_artifacts
>
archive
[
'count'
]
:
num_artifacts
=
archive
[
'count'
]
if
num_artifacts
>
total_possible_artifacts
:
num_artifacts
=
total_possible_artifacts
else
:
print
(
'Not a valid number of artifacts. Try again...'
)
print
(
f
"
\n
Downloading
{
desired_data
}
to the folder '
{
archive
[
'handle'
]
}
.' Depending on the number of artifacts, this may take some time.
\n
"
)
download_directory
=
f
"
{
download_directory
}
/
{
archive
[
'handle'
]
}
"
os
.
makedirs
(
download_directory
,
exist_ok
=
True
)
batches_downloaded
=
0
batches_needed
=
ceil
(
num_artifacts
/
batch_size
)
show_progress
(
batches_downloaded
,
batches_needed
)
while
batches_downloaded
<
batches_needed
:
skip
=
batches_downloaded
*
batch_size
limit
=
batch_size
if
skip
+
limit
>
num_artifacts
:
limit
=
num_artifacts
-
skip
if
get_full_text
:
ft_req
=
requests
.
get
(
f
"
{
corpora_host
}
/api/corpus/
{
corpus_id
}
/ArcArtifact/?f_archive.id=
{
archive
[
'id'
]
}
&only=external_uri,full_text_contents&page=
{
batches_downloaded
+
1
}
&page-size=
{
batch_size
}
"
)
ft
=
ft_req
.
json
()
if
'records'
in
ft
:
with
open
(
f
"
{
download_directory
}
/
{
batches_downloaded
+
1
}
.json"
,
'w'
,
encoding
=
'utf-8'
)
as
ft_out
:
json
.
dump
(
ft
[
'records'
],
ft_out
,
indent
=
4
)
while
num_to_skip
==
-
1
:
choice
=
input
(
f
"How many artifacts would you like to skip (useful for resuming a long-running job) [0]? "
)
if
choice
==
''
:
num_to_skip
=
0
elif
choice
.
isdigit
():
num_to_skip
=
int
(
choice
)
else
:
ttl_req
=
requests
.
get
(
f
"
{
corpora_host
}
/corpus/
{
corpus_id
}
/ArcArchive/
{
archive
[
'id'
]
}
/LINCS/?skip=
{
skip
}
&limit=
{
limit
}
"
)
ttl
=
ttl_req
.
text
with
open
(
f
"
{
download_directory
}
/
{
batches_downloaded
+
1
}
.ttl"
,
'w'
,
encoding
=
'utf-8'
)
as
ttl_out
:
ttl_out
.
write
(
ttl
)
print
(
'Not a valid number of artifacts. Try again...'
)
batches_downloaded
+=
1
show_progress
(
batches_downloaded
,
batches_needed
)
artifacts_remaining
=
num_artifacts
num_skipped
=
0
for
arch
in
archive
:
if
artifacts_remaining
:
arch_limit
=
arch
[
'count'
]
if
artifacts_remaining
<
arch_limit
:
arch_limit
=
artifacts_remaining
skip_this_archive
=
False
if
num_to_skip
>
0
:
if
num_skipped
+
arch_limit
<
num_to_skip
:
skip_this_archive
=
True
num_skipped
+=
arch_limit
if
not
skip_this_archive
:
try
:
download_archive_data
(
arch
,
arch_limit
,
download_directory
,
get_full_text
,
corpora_host
,
corpus_id
)
except
:
print
(
"Oh, no! An error occurred:"
)
print
(
traceback
.
format_exc
())
print
(
f
"
\n\n
When resuming this job, you might skip the first
{
num_artifacts
-
artifacts_remaining
}
artifacts."
)
artifacts_remaining
-=
arch_limit
print
(
f
"
\n
{
num_artifacts
-
artifacts_remaining
}
total artifacts downloaded so far."
)
print
(
"
\n
Download completed!"
)
exit
()
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment
Menu
Projects
Groups
Snippets
Help