|
1 | 1 | #!/usr/bin/python
|
2 | 2 |
|
3 | 3 | import sys
|
4 |
| -import urllib2 |
| 4 | +import urllib.request |
5 | 5 | import json
|
6 | 6 | import os
|
7 | 7 | import base64, uuid
|
8 | 8 | import re
|
| 9 | +import logging |
9 | 10 |
|
# Directory where fetched JSON responses are cached between runs.
CACHEDIR = "cache"
# exist_ok avoids the check-then-create race of os.path.exists + makedirs
# (two concurrent runs could both pass the exists() check, one then crashing).
os.makedirs(CACHEDIR, exist_ok=True)
13 | 14 |
|
def cacheFilename(url):
    """Derive a filesystem-safe cache filename for *url*.

    The URL is base64-encoded, then trailing '=' padding and newlines are
    stripped and every '/' is mapped to '_' so the result is a single,
    safe path segment ending in '.json'.
    """
    encoded = base64.b64encode(url.encode("utf-8")).decode("utf-8")
    safe = encoded.rstrip('=\n').replace('/', '_')
    return safe + '.json'
|
17 | 18 |
|
def fetch(url, retry=0):
    """Fetch JSON from *url*, caching the parsed response in CACHEDIR.

    Credentials embedded in the URL (``http://user:pass@host/...``) are
    stripped out and sent as an HTTP Basic ``Authorization`` header instead.

    Args:
        url: the URL to fetch (may contain inline credentials).
        retry: internal retry counter; a 500 response is retried up to 5 times.

    Returns:
        The decoded JSON data, or None if the request failed.
    """
    cached = os.path.join(CACHEDIR, cacheFilename(url))
    if os.path.exists(cached):
        logging.debug('Getting %s from cache: %s', url, cached)
        with open(cached, encoding='utf-8') as fh:
            data = json.loads(fh.read())
    else:
        request = urllib.request.Request(url)
        if '@' in url:
            # Pull "user:pass" out of the URL and re-issue the request
            # against the credential-free URL with a Basic auth header.
            result = re.search(r"\/\/(.*)@", url)
            url = re.sub(r"\/\/*.*@", r'//', url)
            request = urllib.request.Request(url)
            if result:
                # b64encode takes bytes and returns bytes; encode/decode so
                # the header value is a str, not the repr of a bytes object.
                credentials = base64.b64encode(
                    result.group(1).encode('utf-8')).decode('ascii')
                request.add_header("Authorization", "Basic %s" % credentials)

        try:
            # Context manager ensures the connection is closed even if
            # read()/json.loads raises (the original leaked it on error).
            with urllib.request.urlopen(request) as fh:
                data = json.loads(fh.read())

            with open(cached, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)

        except urllib.error.HTTPError as error:
            data = None
            # 404 means no annotations for this canvas
            logging.error("Getting %s failed due to %s: %s (Retry: %s)",
                          url, error.code, error.reason, retry)
            if error.code == 500 and retry < 5:
                return fetch(url, retry + 1)
        except urllib.error.URLError as error:
            data = None
            logging.error("Failed to get %s due to %s. Do you have the correct URL for SAS and is it running?", url, error)

    return data
|
if __name__ == "__main__":
    logging.basicConfig(encoding='utf-8', level=logging.ERROR)
    if len(sys.argv) < 4:
        print("Usage:\n\tdownloadAnnotationListsByCanvas.py [manifest] [sas_endpoint] [output_dir] [optional outputfilename proc]")
        print("Arg no = %s" % len(sys.argv))
        sys.exit(0)

    print("Downloading manifest: {}".format(sys.argv[1]))
    manifest = fetch(sys.argv[1])
    if not manifest:
        print('Failed to load manifest')
        # sys.exit instead of the site-injected exit(): exit() is only a
        # REPL convenience and is not guaranteed to exist under all runners.
        sys.exit(-1)

    sasEndpoint = sys.argv[2]
    if sasEndpoint.endswith('/'):
        # remove last slash so the URL joins below don't produce "//"
        sasEndpoint = sasEndpoint[:-1]

    outputDirectory = sys.argv[3]
    # Create the output directory once, up front; exist_ok avoids the
    # check-then-create race and the per-canvas re-check of the original.
    os.makedirs(outputDirectory, exist_ok=True)

    count = 0
    for canvas in manifest["sequences"][0]["canvases"]:
        count += 1
        annoListData = fetch("%s/annotation/search?uri=%s" % (sasEndpoint, canvas["@id"]))
        if annoListData:
            print("Downloaded annotations for canvas: {} ".format(canvas["@id"]))
            # Wrap the bare annotation array in a sc:AnnotationList resource.
            annoList = {
                "@type": "sc:AnnotationList",
                "context": "http://iiif.io/api/presentation/2/context.json",
                "resources": annoListData
            }
            # 'nlw' mode names output files after the canvas id's last path
            # segment; otherwise files are numbered page1.json, page2.json, ...
            if len(sys.argv) > 4 and sys.argv[4] == 'nlw':
                filename = canvas["@id"].split('/')[-1]
            else:
                filename = "page%s.json" % count

            outFilename = "%s/%s" % (outputDirectory, filename)
            # Explicit encoding so output is UTF-8 regardless of locale.
            with open(outFilename, 'w', encoding='utf-8') as outfile:
                json.dump(annoList, outfile, indent=4)
            print('Saved file: {}'.format(outFilename))
|
0 commit comments