|
1 |
| -FROM ubuntu:20.04 |
| 1 | +FROM python:3.9 |
2 | 2 | ENV DEBIAN_FRONTEND=noninteractive
|
3 | 3 |
|
4 | 4 | LABEL org.opencontainers.image.title="FollowTheMoney File Ingestors"
|
5 | 5 | LABEL org.opencontainers.image.licenses="MIT"
|
6 | 6 | LABEL org.opencontainers.image.source="https://github.com/alephdata/ingest-file"
|
7 | 7 |
|
8 | 8 | # Enable non-free archive for `unrar`.
|
9 |
| -# RUN echo "deb http://http.us.debian.org/debian stretch non-free" >/etc/apt/sources.list.d/nonfree.list |
10 |
| -RUN apt-get -qq -y update \ |
11 |
| - && apt-get -qq -y install build-essential locales ca-certificates \ |
12 |
| - # git |
13 |
| - git \ |
14 |
| - # python deps (mostly to install their dependencies) |
15 |
| - python3-pip python3-dev python3-pil \ |
16 |
| - # tesseract |
17 |
| - tesseract-ocr libtesseract-dev libleptonica-dev pkg-config\ |
18 |
| - # libraries |
19 |
| - libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \ |
20 |
| - zlib1g-dev libicu-dev libxml2-dev \ |
21 |
| - # package tools |
22 |
| - unrar p7zip-full \ |
23 |
| - # audio & video metadata |
24 |
| - libmediainfo-dev \ |
25 |
| - # image processing, djvu |
26 |
| - imagemagick-common imagemagick mdbtools djvulibre-bin \ |
27 |
| - libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \ |
28 |
| - libtiff-tools ghostscript librsvg2-bin jbig2dec \ |
29 |
| - pst-utils \ |
30 |
| - ### tesseract |
31 |
| - tesseract-ocr-eng \ |
32 |
| - tesseract-ocr-swa \ |
33 |
| - tesseract-ocr-swe \ |
34 |
| - # tesseract-ocr-tam \ |
35 |
| - # tesseract-ocr-tel \ |
36 |
| - tesseract-ocr-fil \ |
37 |
| - # tesseract-ocr-tha \ |
38 |
| - tesseract-ocr-tur \ |
39 |
| - tesseract-ocr-ukr \ |
40 |
| - # tesseract-ocr-vie \ |
41 |
| - tesseract-ocr-nld \ |
42 |
| - tesseract-ocr-nor \ |
43 |
| - tesseract-ocr-pol \ |
44 |
| - tesseract-ocr-por \ |
45 |
| - tesseract-ocr-ron \ |
46 |
| - tesseract-ocr-rus \ |
47 |
| - tesseract-ocr-slk \ |
48 |
| - tesseract-ocr-slv \ |
49 |
| - tesseract-ocr-spa \ |
50 |
| - # tesseract-ocr-spa_old \ |
51 |
| - tesseract-ocr-sqi \ |
52 |
| - tesseract-ocr-srp \ |
53 |
| - tesseract-ocr-ind \ |
54 |
| - tesseract-ocr-isl \ |
55 |
| - tesseract-ocr-ita \ |
56 |
| - # tesseract-ocr-ita_old \ |
57 |
| - # tesseract-ocr-jpn \ |
58 |
| - tesseract-ocr-kan \ |
59 |
| - tesseract-ocr-kat \ |
60 |
| - # tesseract-ocr-kor \ |
61 |
| - tesseract-ocr-khm \ |
62 |
| - tesseract-ocr-lav \ |
63 |
| - tesseract-ocr-lit \ |
64 |
| - # tesseract-ocr-mal \ |
65 |
| - tesseract-ocr-mkd \ |
66 |
| - tesseract-ocr-mya \ |
67 |
| - tesseract-ocr-mlt \ |
68 |
| - tesseract-ocr-msa \ |
69 |
| - tesseract-ocr-est \ |
70 |
| - # tesseract-ocr-eus \ |
71 |
| - tesseract-ocr-fin \ |
72 |
| - tesseract-ocr-fra \ |
73 |
| - tesseract-ocr-frk \ |
74 |
| - # tesseract-ocr-frm \ |
75 |
| - # tesseract-ocr-glg \ |
76 |
| - # tesseract-ocr-grc \ |
77 |
| - tesseract-ocr-heb \ |
78 |
| - tesseract-ocr-hin \ |
79 |
| - tesseract-ocr-hrv \ |
80 |
| - tesseract-ocr-hye \ |
81 |
| - tesseract-ocr-hun \ |
82 |
| - # tesseract-ocr-ben \ |
83 |
| - tesseract-ocr-bul \ |
84 |
| - tesseract-ocr-cat \ |
85 |
| - tesseract-ocr-ces \ |
86 |
| - tesseract-ocr-nep \ |
87 |
| - # tesseract-ocr-chi_sim \ |
88 |
| - # tesseract-ocr-chi_tra \ |
89 |
| - # tesseract-ocr-chr \ |
90 |
| - tesseract-ocr-dan \ |
91 |
| - tesseract-ocr-deu \ |
92 |
| - tesseract-ocr-ell \ |
93 |
| - # tesseract-ocr-enm \ |
94 |
| - # tesseract-ocr-epo \ |
95 |
| - # tesseract-ocr-equ \ |
96 |
| - tesseract-ocr-afr \ |
97 |
| - tesseract-ocr-ara \ |
98 |
| - tesseract-ocr-aze \ |
99 |
| - tesseract-ocr-bel \ |
100 |
| - tesseract-ocr-uzb \ |
101 |
| - ### pdf convert: libreoffice + a bunch of fonts |
102 |
| - libreoffice fonts-opensymbol hyphen-fr hyphen-de \ |
103 |
| - hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \ |
104 |
| - fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \ |
105 |
| - fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \ |
106 |
| - fonts-tlwg-purisa \ |
107 |
| - ### |
108 |
| - && apt-get -qq -y autoremove \ |
109 |
| - && apt-get clean \ |
110 |
| - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ |
111 |
| - && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 |
| 9 | +RUN . /etc/os-release && echo "deb http://http.us.debian.org/debian ${VERSION_CODENAME} non-free" >/etc/apt/sources.list.d/nonfree.list \ |
| 10 | + && apt-get -qq -y update \ |
| 11 | + && apt-get -qq -y install build-essential locales ca-certificates \ |
| 12 | + # git |
| 13 | + git \ |
| 14 | + # python deps (mostly to install their dependencies) |
| 15 | + python3-dev \ |
| 16 | + # tesseract |
| 17 | + tesseract-ocr libtesseract-dev libleptonica-dev pkg-config\ |
| 18 | + # libraries |
| 19 | + libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \ |
| 20 | + zlib1g-dev libicu-dev libxml2-dev \ |
| 21 | + # package tools |
| 22 | + unrar p7zip-full \ |
| 23 | + # audio & video metadata |
| 24 | + libmediainfo-dev \ |
| 25 | + # image processing, djvu |
| 26 | + imagemagick-common imagemagick mdbtools djvulibre-bin \ |
| 27 | + libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \ |
| 28 | + libtiff-tools ghostscript librsvg2-bin jbig2dec \ |
| 29 | + pst-utils libopenjp2-7-dev libgif-dev libpng-dev \ |
| 30 | + ### tesseract |
| 31 | + tesseract-ocr-eng \ |
| 32 | + tesseract-ocr-swa \ |
| 33 | + tesseract-ocr-swe \ |
| 34 | + # tesseract-ocr-tam \ |
| 35 | + # tesseract-ocr-tel \ |
| 36 | + tesseract-ocr-fil \ |
| 37 | + # tesseract-ocr-tha \ |
| 38 | + tesseract-ocr-tur \ |
| 39 | + tesseract-ocr-ukr \ |
| 40 | + # tesseract-ocr-vie \ |
| 41 | + tesseract-ocr-nld \ |
| 42 | + tesseract-ocr-nor \ |
| 43 | + tesseract-ocr-pol \ |
| 44 | + tesseract-ocr-por \ |
| 45 | + tesseract-ocr-ron \ |
| 46 | + tesseract-ocr-rus \ |
| 47 | + tesseract-ocr-slk \ |
| 48 | + tesseract-ocr-slv \ |
| 49 | + tesseract-ocr-spa \ |
| 50 | + # tesseract-ocr-spa_old \ |
| 51 | + tesseract-ocr-sqi \ |
| 52 | + tesseract-ocr-srp \ |
| 53 | + tesseract-ocr-ind \ |
| 54 | + tesseract-ocr-isl \ |
| 55 | + tesseract-ocr-ita \ |
| 56 | + # tesseract-ocr-ita_old \ |
| 57 | + # tesseract-ocr-jpn \ |
| 58 | + tesseract-ocr-kan \ |
| 59 | + tesseract-ocr-kat \ |
| 60 | + # tesseract-ocr-kor \ |
| 61 | + tesseract-ocr-khm \ |
| 62 | + tesseract-ocr-lav \ |
| 63 | + tesseract-ocr-lit \ |
| 64 | + # tesseract-ocr-mal \ |
| 65 | + tesseract-ocr-mkd \ |
| 66 | + tesseract-ocr-mya \ |
| 67 | + tesseract-ocr-mlt \ |
| 68 | + tesseract-ocr-msa \ |
| 69 | + tesseract-ocr-est \ |
| 70 | + # tesseract-ocr-eus \ |
| 71 | + tesseract-ocr-fin \ |
| 72 | + tesseract-ocr-fra \ |
| 73 | + tesseract-ocr-frk \ |
| 74 | + # tesseract-ocr-frm \ |
| 75 | + # tesseract-ocr-glg \ |
| 76 | + # tesseract-ocr-grc \ |
| 77 | + tesseract-ocr-heb \ |
| 78 | + tesseract-ocr-hin \ |
| 79 | + tesseract-ocr-hrv \ |
| 80 | + tesseract-ocr-hye \ |
| 81 | + tesseract-ocr-hun \ |
| 82 | + # tesseract-ocr-ben \ |
| 83 | + tesseract-ocr-bul \ |
| 84 | + tesseract-ocr-cat \ |
| 85 | + tesseract-ocr-ces \ |
| 86 | + tesseract-ocr-nep \ |
| 87 | + # tesseract-ocr-chi_sim \ |
| 88 | + # tesseract-ocr-chi_tra \ |
| 89 | + # tesseract-ocr-chr \ |
| 90 | + tesseract-ocr-dan \ |
| 91 | + tesseract-ocr-deu \ |
| 92 | + tesseract-ocr-ell \ |
| 93 | + # tesseract-ocr-enm \ |
| 94 | + # tesseract-ocr-epo \ |
| 95 | + # tesseract-ocr-equ \ |
| 96 | + tesseract-ocr-afr \ |
| 97 | + tesseract-ocr-ara \ |
| 98 | + tesseract-ocr-aze \ |
| 99 | + tesseract-ocr-bel \ |
| 100 | + tesseract-ocr-uzb \ |
| 101 | + ### pdf convert: libreoffice + a bunch of fonts |
| 102 | + libreoffice fonts-opensymbol hyphen-fr hyphen-de \ |
| 103 | + hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \ |
| 104 | + fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \ |
| 105 | + fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \ |
| 106 | + fonts-tlwg-purisa \ |
| 107 | + ### |
| 108 | + && apt-get -qq -y autoremove \ |
| 109 | + && apt-get clean \ |
| 110 | + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ |
| 111 | + && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 |
112 | 112 |
|
113 | 113 | # Locale and threading defaults: UTF-8 locale so the file system is handled as Unicode, UTC timezone, one OpenMP/OpenBLAS thread.
|
114 | 114 | ENV LANG=en_US.UTF-8 \
|
115 |
| - TZ='UTC' \ |
116 |
| - OMP_THREAD_LIMIT='1' \ |
117 |
| - OPENBLAS_NUM_THREADS='1' |
| 115 | + TZ=UTC \ |
| 116 | + OMP_THREAD_LIMIT=1 \ |
| 117 | + OPENBLAS_NUM_THREADS=1 |
118 | 118 |
|
119 | 119 | RUN groupadd --gid 1000 --system app \
|
120 |
| - && useradd -m -u 1000 -s /bin/false -g app app |
| 120 | + && useradd --create-home --uid 1000 --shell /bin/false --gid app app |
121 | 121 |
|
122 | 122 | # Download the ftm-typepredict model; --fail aborts the build on an HTTP error instead of saving the error page as the model
|
123 | 123 | RUN mkdir /models/ && \
|
124 |
| - curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz" |
| 124 | + curl --fail --location -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz" |
125 | 125 |
|
126 | 126 | COPY requirements.txt /tmp/
|
127 |
| -RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip |
128 |
| -RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel |
129 | 127 | RUN pip3 install --no-cache-dir --no-binary tesserocr --requirement /tmp/requirements.txt
|
130 | 128 |
|
131 | 129 | # Install spaCy models: one "*_sm" package per language, 15 languages spread over three RUN layers (zh is left disabled below)
|
132 | 130 | RUN python3 -m spacy download en_core_web_sm \
|
133 |
| - && python3 -m spacy download de_core_news_sm \ |
134 |
| - && python3 -m spacy download fr_core_news_sm \ |
135 |
| - && python3 -m spacy download es_core_news_sm |
| 131 | + && python3 -m spacy download de_core_news_sm \ |
| 132 | + && python3 -m spacy download fr_core_news_sm \ |
| 133 | + && python3 -m spacy download es_core_news_sm |
136 | 134 | RUN python3 -m spacy download ru_core_news_sm \
|
137 |
| - && python3 -m spacy download pt_core_news_sm \ |
138 |
| - && python3 -m spacy download ro_core_news_sm \ |
139 |
| - && python3 -m spacy download mk_core_news_sm |
| 135 | + && python3 -m spacy download pt_core_news_sm \ |
| 136 | + && python3 -m spacy download ro_core_news_sm \ |
| 137 | + && python3 -m spacy download mk_core_news_sm |
140 | 138 | RUN python3 -m spacy download el_core_news_sm \
|
141 |
| - && python3 -m spacy download pl_core_news_sm \ |
142 |
| - && python3 -m spacy download it_core_news_sm \ |
143 |
| - && python3 -m spacy download lt_core_news_sm \ |
144 |
| - && python3 -m spacy download nl_core_news_sm \ |
145 |
| - && python3 -m spacy download nb_core_news_sm \ |
146 |
| - && python3 -m spacy download da_core_news_sm |
| 139 | + && python3 -m spacy download pl_core_news_sm \ |
| 140 | + && python3 -m spacy download it_core_news_sm \ |
| 141 | + && python3 -m spacy download lt_core_news_sm \ |
| 142 | + && python3 -m spacy download nl_core_news_sm \ |
| 143 | + && python3 -m spacy download nb_core_news_sm \ |
| 144 | + && python3 -m spacy download da_core_news_sm |
147 | 145 | # RUN python3 -m spacy download zh_core_web_sm
|
148 | 146 |
|
149 | 147 | COPY . /ingestors
|
150 | 148 | WORKDIR /ingestors
|
151 |
| -RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors |
| 149 | +RUN pip install --no-cache-dir --use-pep517 --config-settings editable_mode=compat --editable /ingestors |
152 | 150 | RUN chown --recursive app:app /ingestors
|
153 | 151 |
|
154 | 152 | ENV ARCHIVE_TYPE=file \
|
155 |
| - ARCHIVE_PATH=/data \ |
156 |
| - FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ |
157 |
| - REDIS_URL=redis://redis:6379/0 \ |
158 |
| - TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata \ |
159 |
| - LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 |
| 153 | + ARCHIVE_PATH=/data \ |
| 154 | + FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ |
| 155 | + REDIS_URL=redis://redis:6379/0 \ |
| 156 | + TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \ |
| 157 | + LD_PRELOAD=libgomp.so.1 |
160 | 158 |
|
161 | 159 | # USER app
|
162 | 160 | CMD ["ingestors", "process"]
|
0 commit comments