Skip to content

Commit d549fdf

Browse files
committed
✨ first compactibility with messytables' pdf
1 parent bcb0fcc commit d549fdf

File tree

9 files changed

+268
-5
lines changed

9 files changed

+268
-5
lines changed

.moban.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ targets:
99
- README.rst: README.rst
1010
- setup.py: setup.py
1111
- requirements.txt: requirements.txt
12-
- LICENSE: NEW_BSD_LICENSE.jj2
1312
- MANIFEST.in: MANIFEST.in.jj2
1413
- "tests/requirements.txt": "tests/requirements.txt"
1514
- test.sh: test.script.jj2

LICENSE

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ that the following conditions are met:
1616
may not be used to endorse or promote products derived from this software
1717
without specific prior written permission.
1818

19+
Please also note that this library contains a few functions and test fixtures
20+
from messytables, which is under the MIT license; please see their license
21+
at the end.
22+
1923
THIS SOFTWARE AND DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND
2024
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
2125
NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -27,4 +31,24 @@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
2731
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
2832
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
2933
SOFTWARE AND DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
30-
DAMAGE.
34+
DAMAGE.
35+
36+
Copyright (c) 2012-2017 The Open Knowledge Foundation Ltd.
37+
38+
Permission is hereby granted, free of charge, to any person obtaining a copy of
39+
this software and associated documentation files (the "Software"), to deal in
40+
the Software without restriction, including without limitation the rights to
41+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
42+
of the Software, and to permit persons to whom the Software is furnished to do
43+
so, subject to the following conditions:
44+
45+
The above copyright notice and this permission notice shall be included in all
46+
copies or substantial portions of the Software.
47+
48+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
52+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
53+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54+
SOFTWARE.

commons

Lines changed: 0 additions & 1 deletion
This file was deleted.

pyexcel_pdfr/__init__.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,24 @@
1-
from ._version import __version__, __author__ # flake8: noqa
1+
"""
2+
pyexcel_pdfr
3+
~~~~~~~~~~~~~~~~~~~
4+
Read tables from pdf files
5+
:copyright: (c) 2015-2017 by Onni Software Ltd & its contributors
6+
:license: New BSD License
7+
"""
8+
from ._version import __version__, __author__ # flake8: noqa
9+
from pyexcel_io.plugins import IOPluginInfoChain
10+
from pyexcel_io.io import get_data as read_data, isstream
11+
12+
# The single file type this plugin reads.
__FILE_TYPE__ = 'pdf'

# Register the reader class (given as a path relative to this package)
# for that file type; pdf content is handled as a binary stream.
IOPluginInfoChain(__name__).add_a_reader(
    file_types=[__FILE_TYPE__],
    stream_type='binary',
    relative_plugin_class_path='pdfr.PdfFile'
)
18+
19+
20+
def get_data(afile, file_type=None, **keywords):
    """Read data from *afile* with this module as the default reader.

    When *afile* is a stream and no *file_type* was given, the file type
    falls back to the one this module supports; otherwise *file_type* is
    passed through unchanged. Remaining keywords go to the reader.
    """
    stream_without_type = isstream(afile) and file_type is None
    effective_type = __FILE_TYPE__ if stream_without_type else file_type
    return read_data(afile, file_type=effective_type, **keywords)

pyexcel_pdfr/pdfr.py

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
"""
2+
pyexcel_pdfr.pdfr
3+
~~~~~~~~~~~~~~~~~~~
4+
pdf table reader using pdftables
5+
6+
:copyright: (c) 2015-2017 by Onni Software Ltd & its contributors
7+
:license: New BSD License
8+
"""
9+
import re
10+
import datetime
11+
12+
from pdftables import get_tables
13+
from pyexcel_io.book import BookReader
14+
from pyexcel_io.sheet import SheetReader, NamedContent
15+
from pyexcel_io._compact import OrderedDict
16+
17+
18+
class PdfTable(SheetReader):
    """Expose one table extracted from a pdf as a sheet of cells.

    Cell text can optionally be converted into int, float, date or
    datetime values; text that is not recognised is returned unchanged.
    """

    def __init__(self, sheet, auto_detect_int=True,
                 auto_detect_float=True,
                 auto_detect_datetime=True,
                 ignore_infinity=True,
                 **keywords):
        """
        :param sheet: named content; its ``payload`` is the table and its
                      ``name`` is used as the sheet name
        :param auto_detect_int: try to convert cell text to ``int``
        :param auto_detect_float: try to convert cell text to ``float``
        :param auto_detect_datetime: try to convert cell text to a date
                                     or datetime
        :param ignore_infinity: when true, text that parses to positive
                                or negative infinity is not returned as
                                a float
        :param keywords: passed on to ``SheetReader``
        """
        SheetReader.__init__(self, sheet, **keywords)
        self.__auto_detect_int = auto_detect_int
        self.__auto_detect_float = auto_detect_float
        self.__auto_detect_datetime = auto_detect_datetime
        # Previously never assigned, so __convert_cell raised
        # AttributeError as soon as a cell parsed to +/- infinity.
        self.__ignore_infinity = ignore_infinity
        # presumably SheetReader.__init__ stores `sheet` as
        # self._native_sheet -- confirm against pyexcel_io
        self.__table = self._native_sheet.payload
        # column index -> number of '' placeholders still to be emitted
        self.__column_span = {}

    @property
    def name(self):
        """Name of the underlying named content."""
        return self._native_sheet.name

    def row_iterator(self):
        """Yield each row of the table as a list of raw cells."""
        if hasattr(self.__table, "cell_data"):
            # New style of cell data.
            for row in self.__table.cell_data:
                yield list(row)
        else:
            for row in self.__table:
                yield list(row)

    def column_iterator(self, row):
        """Yield the cell values of *row*, padding spanned cells with ''.

        Cells without a ``topleft`` attribute are yielded untouched;
        other cells have their ``content`` converted and their ``size``
        used to work out how many '' placeholders to emit.
        """
        index = 0
        for cell in row:
            # generate '' due to previous rowspan
            while index in self.__column_span:
                # and keep generating '' if next index is in the list
                self.__column_span[index] -= 1
                if self.__column_span[index] == 0:
                    del self.__column_span[index]
                yield ''
                index += 1

            if not hasattr(cell, 'topleft'):
                # plain value: nothing to convert, no span information
                yield cell
                index += 1
                continue

            # NOTE(review): which element of cell.size is horizontal and
            # which is vertical is not visible here -- confirm against
            # the table cell type before touching the logic below.
            col_span, row_span = cell.size
            yield self.__convert_cell(cell.content)
            if row_span > 1:
                # generate '' due to colspan
                if col_span > 1:
                    for offset in range(row_span):
                        if offset > 0:
                            # for next cell, give full col span
                            self.__column_span[index+offset] = col_span
                        else:
                            # for current cell, give -1 because it has been
                            # yielded
                            self.__column_span[index+offset] = col_span - 1
                else:
                    # no col span found, so just repeat in the same row
                    for _ in range(row_span-1):
                        yield ''
                        index += 1
            else:
                if col_span > 1:
                    self.__column_span[index] = col_span - 1
            # next index
            index += 1

    def __convert_cell(self, cell_text):
        """Convert *cell_text* using the enabled detectors.

        Detection order is int, float, then date/datetime; the first
        detector that returns a value wins. The original text is
        returned when nothing matches.
        """
        ret = None
        if self.__auto_detect_int:
            ret = _detect_int_value(cell_text)
        if ret is None and self.__auto_detect_float:
            ret = _detect_float_value(cell_text)
            shall_we_ignore_the_conversion = (
                (ret in [float('inf'), float('-inf')]) and
                self.__ignore_infinity
            )
            if shall_we_ignore_the_conversion:
                ret = None
        if ret is None and self.__auto_detect_datetime:
            ret = _detect_date_value(cell_text)
        if ret is None:
            ret = cell_text
        return ret
101+
102+
103+
class PdfFile(BookReader):
    """Book reader that presents every table found in a pdf as a sheet."""

    def __init__(self):
        BookReader.__init__(self)
        # Only assigned when the pdf is opened from a file name.
        self._file_handle = None

    def open(self, file_name, **keywords):
        """Open the pdf stored at *file_name*."""
        BookReader.open(self, file_name, **keywords)
        self._load_from_file()

    def open_stream(self, file_stream, **keywords):
        """Open a pdf supplied as a stream."""
        BookReader.open_stream(self, file_stream, **keywords)
        self._load_from_memory()

    def read_all(self):
        """Return an ordered mapping of sheet name to rows."""
        sheets = OrderedDict()
        for native_sheet in self._native_book:
            sheets.update(self.read_sheet(native_sheet))
        return sheets

    def read_sheet(self, native_sheet):
        """Return ``{name: rows}`` for a single table."""
        table = PdfTable(native_sheet, **self._keywords)
        return {table.name: table.to_array()}

    def _load_from_file(self):
        # The handle is kept so that close() can release it later.
        self._file_handle = open(self._file_name, 'rb')
        self._native_book = self._parse_pdf(self._file_handle)

    def _load_from_memory(self):
        self._native_book = self._parse_pdf(self._file_stream)

    def _parse_pdf(self, file_handle):
        """Generate one named content per table found in *file_handle*.

        This is a generator, so nothing is extracted until the result is
        iterated.
        """
        name_template = "Table {0} of {1} on page {2} of {3}"
        for table in get_tables(file_handle):
            sheet_name = name_template.format(
                table.table_number_on_page,
                table.total_tables_on_page,
                table.page_number,
                table.total_pages)
            yield NamedContent(sheet_name, table)

    def close(self):
        """Close the file handle if this reader opened one."""
        if not self._file_handle:
            return
        self._file_handle.close()
145+
146+
147+
def _detect_date_value(csv_cell_text):
148+
"""
149+
Read the date formats that were written by csv.writer
150+
"""
151+
ret = None
152+
try:
153+
if len(csv_cell_text) == 10:
154+
ret = datetime.datetime.strptime(
155+
csv_cell_text,
156+
"%Y-%m-%d")
157+
ret = ret.date()
158+
elif len(csv_cell_text) == 19:
159+
ret = datetime.datetime.strptime(
160+
csv_cell_text,
161+
"%Y-%m-%d %H:%M:%S")
162+
elif len(csv_cell_text) > 19:
163+
ret = datetime.datetime.strptime(
164+
csv_cell_text[0:26],
165+
"%Y-%m-%d %H:%M:%S.%f")
166+
except ValueError:
167+
pass
168+
return ret
169+
170+
171+
def _detect_float_value(csv_cell_text):
172+
try:
173+
should_we_skip_it = (csv_cell_text.startswith('0') and
174+
csv_cell_text.startswith('0.') is False)
175+
if should_we_skip_it:
176+
# do not convert if a number starts with 0
177+
# e.g. 014325
178+
return None
179+
else:
180+
return float(csv_cell_text)
181+
except ValueError:
182+
return None
183+
184+
185+
def _detect_int_value(csv_cell_text):
186+
if csv_cell_text.startswith('0') and len(csv_cell_text) > 1:
187+
return None
188+
try:
189+
return int(csv_cell_text)
190+
except ValueError:
191+
pattern = '([0-9]+,)*[0-9]+$'
192+
if re.match(pattern, csv_cell_text):
193+
integer_string = csv_cell_text.replace(',', '')
194+
return int(integer_string)
195+
else:
196+
return None

setupmobans

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/fixtures/CBP-7857.pdf

376 KB
Binary file not shown.

tests/fixtures/simple.pdf

55.9 KB
Binary file not shown.

tests/test_pdfr.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import os
2+
from nose.tools import eq_
3+
import pyexcel as p
4+
5+
6+
def test_simple_pdf():
    """A pdf with one table loads as one sheet named after its position."""
    fixture_path = get_fixtures('simple.pdf')
    workbook = p.get_book(file_name=fixture_path)
    eq_(workbook.number_of_sheets(), 1)
    first_sheet = workbook[0]
    eq_(first_sheet.name, 'Table 1 of 1 on page 1 of 1')
10+
11+
12+
def test_complex_pdf():
    """A multi-page pdf gives five sheets; spot-check one header row."""
    workbook = p.get_book(file_name=get_fixtures('CBP-7857.pdf'))
    expected_first_row = [
        u'16 Higher education', u'stude', u'nt', u'numb', u'ers',
        '', '', '', '', '', '', '', '', '', '', '', '', '', '',
        '', '', '', '', '']
    eq_(workbook.number_of_sheets(), 5)
    sheet_on_page_16 = workbook.Table_1_of_1_on_page_16_of_17
    eq_(sheet_on_page_16.row[0], expected_first_row)
20+
21+
22+
def get_fixtures(file_name):
    """Return the path of *file_name* under ``tests/fixtures``.

    The path is relative to the current working directory.
    """
    fixture_dir = os.path.join("tests", "fixtures")
    return os.path.join(fixture_dir, file_name)

0 commit comments

Comments
 (0)