Skip to content

Commit 5d2e4b3

Browse files
improved pdf_converter (#317)
* improved pdf_converter
* fix test
1 parent 424e450 commit 5d2e4b3

File tree

3 files changed

+7
-7
lines changed

3 files changed

+7
-7
lines changed

cdqa/utils/converters.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -170,8 +170,8 @@ def pdf_converter(directory_path, min_length=200, include_line_breaks=False):
170170
try:
171171
df.loc[i] = [pdf.replace(".pdf",''), None]
172172
raw = parser.from_file(os.path.join(directory_path, pdf))
173-
s = raw["content"]
174-
paragraphs = re.split("\n(?=\u2028|[A-Z-0-9])", s)
173+
s = raw["content"].strip()
174+
paragraphs = re.split("\n\n(?=\u2028|[A-Z-0-9])", s)
175175
list_par = []
176176
temp_para = "" # variable that stores paragraphs with length<min_length
177177
# (considered as a line)
@@ -198,9 +198,9 @@ def pdf_converter(directory_path, min_length=200, include_line_breaks=False):
198198
else:
199199
# appending paragraph p as is to list_par
200200
list_par.append(p.replace("\n", ""))
201-
else:
202-
if temp_para:
203-
list_par.append(temp_para.strip())
201+
else:
202+
if temp_para:
203+
list_par.append(temp_para.strip())
204204

205205
df.loc[i, "paragraphs"] = list_par
206206
except:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ def read(file):
88

99
setup(
1010
name="cdqa",
11-
version="1.3.8",
11+
version="1.3.9",
1212
author="Félix MIKAELIAN, André FARIAS, Matyas AMROUCHE, Olivier SANS, Théo NAZON",
1313
description="An End-To-End Closed Domain Question Answering System",
1414
long_description=read("README.md"),

tests/test_converters.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ def df_converter_check(self, df, include_line_breaks=False):
3939
if include_line_breaks:
4040
para_len = [len(df.paragraphs[i]) for i in range(df.shape[0])]
4141
para_len.sort()
42-
if not para_len == [144, 220, 265]:
42+
if not para_len == [58, 80, 87]:
4343
errors.append(f"error in number of paragraphs : {para_len}")
4444

4545
# assert no error message has been registered, else print messages

0 commit comments

Comments
 (0)