pyexcel
diff --git a/‎.moban.yml
Lines changed: 0 additions & 1 deletion b/‎.moban.yml
Lines changed: 0 additions & 1 deletion
diff --git a/‎LICENSE
Lines changed: 25 additions & 1 deletion b/‎LICENSE
Lines changed: 25 additions & 1 deletion
diff --git a/‎commons
Lines changed: 0 additions & 1 deletion b/‎commons
Lines changed: 0 additions & 1 deletion
diff --git a/‎pyexcel_pdfr/__init__.py
Lines changed: 24 additions & 1 deletion b/‎pyexcel_pdfr/__init__.py
Lines changed: 24 additions & 1 deletion
diff --git a/‎pyexcel_pdfr/pdfr.py
Lines changed: 196 additions & 0 deletions b/‎pyexcel_pdfr/pdfr.py
Lines changed: 196 additions & 0 deletions
diff --git a/‎setupmobans
Lines changed: 0 additions & 1 deletion b/‎setupmobans
Lines changed: 0 additions & 1 deletion
diff --git a/‎tests/fixtures/CBP-7857.pdf
376 KB b/‎tests/fixtures/CBP-7857.pdf
376 KB
diff --git a/‎tests/fixtures/simple.pdf
55.9 KB b/‎tests/fixtures/simple.pdf
55.9 KB
diff --git a/‎tests/test_pdfr.py
Lines changed: 23 additions & 0 deletions b/‎tests/test_pdfr.py
Lines changed: 23 additions & 0 deletions
@@ -9,7 +9,6 @@ targets:
   - README.rst: README.rst
   - setup.py: setup.py
   - requirements.txt: requirements.txt
-  - LICENSE: NEW_BSD_LICENSE.jj2
   - MANIFEST.in: MANIFEST.in.jj2
   - "tests/requirements.txt": "tests/requirements.txt"
   - test.sh: test.script.jj2
 
@@ -16,6 +16,10 @@ that the following conditions are met:
   may not be used to endorse or promote products derived from this software
   without specific prior written permission.
 
+Please also note that this library contains a few functions and test fixtures
+from messytables, which is under the MIT license. Please see their license
+at the end.
+
 THIS SOFTWARE AND DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
 NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -27,4 +31,24 @@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE AND DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGE.
+DAMAGE.
+
+Copyright (c) 2012-2017 The Open Knowledge Foundation Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -1 +1,24 @@
-from ._version import __version__, __author__  # flake8: noqa
+"""
+    pyexcel_pdfr
+    ~~~~~~~~~~~~~~~~~~~
+    Read pdf tables using pdftables
+    :copyright: (c) 2015-2017 by Onni Software Ltd & its contributors
+    :license: New BSD License
+"""
+from ._version import __version__, __author__  # flake8: noqa
+from pyexcel_io.plugins import IOPluginInfoChain
+from pyexcel_io.io import get_data as read_data, isstream
+
+# The single file type this plugin declares support for.
+__FILE_TYPE__ = 'pdf'
+# Register 'pdfr.PdfFile' as a reader for that file type, declaring its
+# stream type as binary. The class is named by a dotted path string here
+# rather than imported in this module.
+IOPluginInfoChain(__name__).add_a_reader(
+    relative_plugin_class_path='pdfr.PdfFile',
+    file_types=[__FILE_TYPE__],
+    stream_type='binary'
+)
+
+
+def get_data(afile, file_type=None, **keywords):
+    """Read *afile* through pyexcel-io and return what its reader returns.
+
+    When *afile* is a stream and no ``file_type`` is given, the file type
+    falls back to this module's ``__FILE_TYPE__``. All other keywords are
+    passed through unchanged.
+    """
+    use_module_default = isstream(afile) and file_type is None
+    effective_type = __FILE_TYPE__ if use_module_default else file_type
+    return read_data(afile, file_type=effective_type, **keywords)
@@ -0,0 +1,196 @@
+"""
+    pyexcel_pdfr.pdfr
+    ~~~~~~~~~~~~~~~~~~~
+    pdf table reader using pdftables
+
+    :copyright: (c) 2015-2017 by Onni Software Ltd & its contributors
+    :license: New BSD License
+"""
+import re
+import datetime
+
+from pdftables import get_tables
+from pyexcel_io.book import BookReader
+from pyexcel_io.sheet import SheetReader, NamedContent
+from pyexcel_io._compact import OrderedDict
+
+
+class PdfTable(SheetReader):
+    """
+    Sheet reader over a single table extracted from a pdf file
+
+    :param sheet: named content whose ``payload`` is the table and whose
+                  ``name`` is reported as the sheet name
+    :param auto_detect_int: try to convert cell text to int
+    :param auto_detect_float: try to convert cell text to float
+    :param auto_detect_datetime: try to convert cell text to date/datetime
+    :param ignore_infinity: keep the cell text when float detection gives
+                            positive or negative infinity
+    :param keywords: passed on to ``SheetReader.__init__``
+    """
+    def __init__(self, sheet, auto_detect_int=True,
+                 auto_detect_float=True,
+                 auto_detect_datetime=True,
+                 ignore_infinity=True,
+                 **keywords):
+        SheetReader.__init__(self, sheet, **keywords)
+        self.__auto_detect_int = auto_detect_int
+        self.__auto_detect_float = auto_detect_float
+        self.__auto_detect_datetime = auto_detect_datetime
+        # read by __convert_cell; it used to be missing here, so a cell
+        # such as 'inf' ended in AttributeError
+        self.__ignore_infinity = ignore_infinity
+        self.__table = self._native_sheet.payload
+        # column index -> number of '' cells still to be emitted at that index
+        self.__column_span = {}
+
+    @property
+    def name(self):
+        """Name of the underlying named content"""
+        return self._native_sheet.name
+
+    def row_iterator(self):
+        """Yield each table row as a new list of cells"""
+        if hasattr(self.__table, "cell_data"):
+            # New style of cell data.
+            rows = self.__table.cell_data
+        else:
+            rows = self.__table
+        for row in rows:
+            yield list(row)
+
+    def column_iterator(self, row):
+        """
+        Yield the values of one row, emitting '' for spanned positions
+
+        Cells without a ``topleft`` attribute are yielded untouched. Other
+        cells have their ``content`` converted via ``__convert_cell``.
+
+        NOTE(review): ``col_span`` and ``row_span`` follow the unpacking
+        order of ``cell.size``; confirm against pdftables which of the two
+        is the horizontal extent.
+        NOTE(review): pending entries in the span table are only flushed
+        in front of a cell, never after the last cell of a row.
+        """
+        index = 0
+        for cell in row:
+            # generate '' due to previous rowspan
+            while index in self.__column_span:
+                # and keep generating '' if next index is in the list
+                self.__column_span[index] -= 1
+                if self.__column_span[index] == 0:
+                    del self.__column_span[index]
+                yield ''
+                index += 1
+
+            if not hasattr(cell, 'topleft'):
+                # plain value: no span information and no type conversion
+                yield cell
+                index += 1
+                continue
+
+            col_span, row_span = cell.size
+            yield self.__convert_cell(cell.content)
+            if row_span > 1:
+                # generate '' due to colspan
+                if col_span > 1:
+                    for offset in range(row_span):
+                        if offset > 0:
+                            # for next cell, give full col span
+                            self.__column_span[index+offset] = col_span
+                        else:
+                            # for current cell, give -1 because it has been
+                            # yielded
+                            self.__column_span[index+offset] = col_span - 1
+                else:
+                    # no col span found, so just repeat in the same row
+                    for _ in range(row_span-1):
+                        yield ''
+                        index += 1
+            else:
+                if col_span > 1:
+                    self.__column_span[index] = col_span - 1
+            # next index
+            index += 1
+
+    def __convert_cell(self, cell_text):
+        """
+        Convert cell text to int, float or date/datetime, in that order
+
+        Each conversion runs only when its auto-detect flag is set and no
+        earlier one produced a value. The text itself is returned when
+        nothing matched.
+        """
+        ret = None
+        if self.__auto_detect_int:
+            ret = _detect_int_value(cell_text)
+        if ret is None and self.__auto_detect_float:
+            ret = _detect_float_value(cell_text)
+            shall_we_ignore_the_conversion = (
+                (ret in [float('inf'), float('-inf')]) and
+                self.__ignore_infinity
+            )
+            if shall_we_ignore_the_conversion:
+                # discard the infinity so the later steps can fall back
+                ret = None
+        if ret is None and self.__auto_detect_datetime:
+            ret = _detect_date_value(cell_text)
+        if ret is None:
+            ret = cell_text
+        return ret
+
+
+class PdfFile(BookReader):
+    """
+    Book reader that presents every table found in a pdf as one sheet
+
+    ``_parse_pdf`` is a generator, so ``get_tables`` only runs once the
+    sheets are iterated, and the sheets can be iterated only once.
+    """
+    def __init__(self):
+        BookReader.__init__(self)
+        # only set when the book is opened by file name
+        self._file_handle = None
+
+    def open(self, file_name, **keywords):
+        """Open the pdf at *file_name* and set up its table iterator"""
+        BookReader.open(self, file_name, **keywords)
+        self._load_from_file()
+
+    def open_stream(self, file_stream, **keywords):
+        """Use *file_stream* as the pdf and set up its table iterator"""
+        BookReader.open_stream(self, file_stream, **keywords)
+        self._load_from_memory()
+
+    def read_all(self):
+        """Return an ordered mapping of sheet name to sheet content"""
+        sheets = OrderedDict()
+        for native_sheet in self._native_book:
+            sheets.update(self.read_sheet(native_sheet))
+        return sheets
+
+    def read_sheet(self, native_sheet):
+        """Return a one-entry dict: sheet name mapped to its array"""
+        table = PdfTable(native_sheet, **self._keywords)
+        sheet_name = table.name
+        return {sheet_name: table.to_array()}
+
+    def _load_from_file(self):
+        # the handle stays open because the tables are parsed lazily
+        handle = open(self._file_name, 'rb')
+        self._file_handle = handle
+        self._native_book = self._parse_pdf(handle)
+
+    def _load_from_memory(self):
+        stream = self._file_stream
+        self._native_book = self._parse_pdf(stream)
+
+    def _parse_pdf(self, file_handle):
+        """Yield one NamedContent per table, named by its position"""
+        name_template = "Table {0} of {1} on page {2} of {3}"
+        for table in get_tables(file_handle):
+            sheet_name = name_template.format(
+                table.table_number_on_page,
+                table.total_tables_on_page,
+                table.page_number,
+                table.total_pages)
+            yield NamedContent(sheet_name, table)
+
+    def close(self):
+        """Close the file handle, if one was opened by this reader"""
+        if not self._file_handle:
+            return
+        self._file_handle.close()
+
+
+def _detect_date_value(csv_cell_text):
+    """
+    Parse the date formats that were written by csv.writer
+
+    The text length selects the format: 10 characters give a date,
+    19 characters give a datetime, and anything longer is cut to 26
+    characters and read as a datetime with microseconds. Other lengths
+    and unparsable text give None.
+    """
+    text_length = len(csv_cell_text)
+    if text_length < 19 and text_length != 10:
+        return None
+    try:
+        if text_length == 10:
+            parsed = datetime.datetime.strptime(csv_cell_text, "%Y-%m-%d")
+            return parsed.date()
+        if text_length == 19:
+            return datetime.datetime.strptime(
+                csv_cell_text, "%Y-%m-%d %H:%M:%S")
+        return datetime.datetime.strptime(
+            csv_cell_text[0:26], "%Y-%m-%d %H:%M:%S.%f")
+    except ValueError:
+        return None
+
+
+def _detect_float_value(csv_cell_text):
+    """
+    Return the text as a float, or None when it should stay text
+
+    Text that starts with '0' but not with '0.' (e.g. 014325) is never
+    converted. Text that float() rejects gives None as well.
+    """
+    zero_padded = (csv_cell_text.startswith('0') and
+                   not csv_cell_text.startswith('0.'))
+    if zero_padded:
+        return None
+    try:
+        return float(csv_cell_text)
+    except ValueError:
+        return None
+
+
+def _detect_int_value(csv_cell_text):
+    """
+    Return the text as an int, or None when it should stay text
+
+    Multi-character text starting with '0' is never converted. Digits
+    grouped by commas (e.g. 1,234) are accepted with the commas removed.
+    """
+    zero_padded = csv_cell_text.startswith('0') and len(csv_cell_text) > 1
+    if zero_padded:
+        return None
+    try:
+        return int(csv_cell_text)
+    except ValueError:
+        pass
+    comma_grouped = '([0-9]+,)*[0-9]+$'
+    if not re.match(comma_grouped, csv_cell_text):
+        return None
+    return int(csv_cell_text.replace(',', ''))
@@ -0,0 +1,23 @@
+import os
+from nose.tools import eq_
+import pyexcel as p
+
+
+def test_simple_pdf():
+    """simple.pdf loads as a book with one sheet named after its table"""
+    fixture_path = get_fixtures('simple.pdf')
+    simple_book = p.get_book(file_name=fixture_path)
+    eq_(simple_book.number_of_sheets(), 1)
+    first_sheet = simple_book[0]
+    eq_(first_sheet.name, 'Table 1 of 1 on page 1 of 1')
+
+
+def test_complex_pdf():
+    """CBP-7857.pdf loads as five sheets; check the first row of one"""
+    fixture_path = get_fixtures('CBP-7857.pdf')
+    complex_book = p.get_book(file_name=fixture_path)
+    expected_first_row = [
+        u'16 Higher education', u'stude', u'nt', u'numb', u'ers',
+        '', '', '', '', '', '', '', '', '', '', '', '', '', '',
+        '', '', '', '', '']
+    eq_(complex_book.number_of_sheets(), 5)
+    page_16_sheet = complex_book.Table_1_of_1_on_page_16_of_17
+    eq_(page_16_sheet.row[0], expected_first_row)
+
+
+def get_fixtures(file_name):
+    """Return the path of *file_name* under the tests/fixtures folder.
+
+    The path is relative, so it resolves against the current working
+    directory.
+    """
+    fixture_folder = os.path.join("tests", "fixtures")
+    return os.path.join(fixture_folder, file_name)