Skip to content

Title fix #21

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions chemdataextractor/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def records(self):
sent_record = first_sent_records[0]
if sent_record.labels or (sent_record.names and len(sent_record.names[0]) > len(el.sentences[0].text) / 2):
head_def_record = sent_record
head_def_record_i = i
head_def_record_i = i - 1 # fix error related with cem that contains nmr that sometimes doesn't detect it well

for record in el.records:
# Keep track of the most recent record with labels
Expand Down Expand Up @@ -215,10 +215,11 @@ def records(self):
continue
else:
# print(record.serialize())
# TODO: check the names and labels, not the whole record
# We have property values but no names or labels... try merge those from previous
if isinstance(el, Paragraph) and (head_def_record or last_product_record or last_id_record or title_record):
# head_def_record from heading takes priority if the heading directly precedes the paragraph ( NOPE: or the last_id_record has no name)
if head_def_record_i and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)):
if last_id_record and not last_id_record.names and head_def_record_i is not None and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)):
if head_def_record:
record.names = head_def_record.names
record.labels = head_def_record.labels
Expand Down Expand Up @@ -272,6 +273,13 @@ def records(self):
record.names.append(name)

# Merge records with any shared name/label
temp_record = []
for record in records:
if len(record.labels) <= 1:
temp_record.append(record)

records.models = temp_record

len_l = len(records)
i = 0
while i < (len_l - 1):
Expand Down
79 changes: 73 additions & 6 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,23 @@
from chemdataextractor import Document
from chemdataextractor.doc import Heading, Paragraph


logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)


unittest.util._MAX_LENGTH = 2000


class TestExtract(unittest.TestCase):

maxDiff = None

def test_melting_point_heading_salt(self):
"""Test extraction of melting point from a heading and paragraphs. Example taken from patent US06840965B2."""
d = Document(
Heading('D. Synthesis of 4-Amino-2-(3-thienyl)phenol Hydrochloride'),
Paragraph('3 g (13.5 mmoles) of 4-nitro-2-(3-thienyl)phenol was dissolved in 40 mL of ethanol and hydrogenated at 25° C. in the presence of 600 mg of a palladium—active carbon catalyst (10%). After the theoretically required amount of hydrogen had been absorbed, the catalyst was filtered off. Following concentration in a rotary evaporator, the reaction mixture was poured onto 20 mL of cold diethyl ether. The precipitated product was filtered off and dried.'),
Paragraph('This gave 1.95 g (75% of the theoretical) of 4-amino-2-(3-thienyl)phenol hydrochloride with a melting point of 130-132° C.')
Paragraph(
'3 g (13.5 mmoles) of 4-nitro-2-(3-thienyl)phenol was dissolved in 40 mL of ethanol and hydrogenated at 25° C. in the presence of 600 mg of a palladium—active carbon catalyst (10%). After the theoretically required amount of hydrogen had been absorbed, the catalyst was filtered off. Following concentration in a rotary evaporator, the reaction mixture was poured onto 20 mL of cold diethyl ether. The precipitated product was filtered off and dried.'),
Paragraph(
'This gave 1.95 g (75% of the theoretical) of 4-amino-2-(3-thienyl)phenol hydrochloride with a melting point of 130-132° C.')

)
expected = [
Expand All @@ -44,7 +43,9 @@ def test_melting_point_heading_salt(self):
{'names': ['carbon']},
{'names': ['hydrogen']},
{'names': ['diethyl ether']},
{'melting_points': [{'units': '°C', 'value': '130-132'}], 'names': ['4-Amino-2-(3-thienyl)phenol Hydrochloride', '4-amino-2-(3-thienyl)phenol hydrochloride'], 'roles': ['product']}
{'melting_points': [{'units': '°C', 'value': '130-132'}],
'names': ['4-Amino-2-(3-thienyl)phenol Hydrochloride', '4-amino-2-(3-thienyl)phenol hydrochloride'],
'roles': ['product']}
]
self.assertEqual(expected, d.records.serialize())

Expand All @@ -55,7 +56,73 @@ def test_parse_control_character(self):
expected = [{'names': ['2,4,6-trinitrotoluene']}]
self.assertEqual(expected, d.records.serialize())

def test_title_parse(self):
    """Check record extraction for two headings followed by two compound paragraphs.

    The document is a section heading, a sub-heading naming
    'Phosphorus Ylide 5', and two paragraphs that each open with a
    labelled compound and report its melting point and 1H-NMR data.
    The compound from the sub-heading is expected as a bare product
    record, while each paragraph's melting point and NMR spectrum are
    expected on the record of the compound introduced in that paragraph
    (labels '2' and '3').
    """
    section_heading = Heading('3.2. Experimental Details')
    synthesis_heading = Heading('3.2.1. Synthesis of Phosphorus Ylide 5')
    amide_paragraph = Paragraph('N-Benzyl-2-chloroacetamide (2): Chloroacetamide 2 was prepared following the procedure described in the literature [23]. To a stirred solution of benzylamine (7.8 mL, 70.8 mmol) in toluene (60 mL) under cooling with ice bath, chloroacetyl chloride (4 g, 35.4 mmol) was slowly added. The reaction mixture was stirred vigorously for 1h at room temperature. The solvent was evaporated under vacuum, the crude reaction was dissolved in dichloromethane (100 mL) and washed with water (3 × 50 mL). The organic layer was dried over anhydrous MgSO4, filtered and the solvent evaporated under vacuum. The product was obtained as a white solid (6.30 g, 97%). m.p. 91–92 °C (93–96 °C from literature) [23]; 1H-NMR (CDCl3) δ 4.11 (s, 2H), 4.50 (d, 2H, J = 6.0 Hz), 6.89 (br s, 1H), 7.26–7.36 (m, 5H, Ar-H).')
    tetrazole_paragraph = Paragraph('1-Benzyl-5-(chloromethyl)-1H-tetrazole (3): Compound 3 was prepared by an analogous method to that described in the literature [24]. PCl5 (7.06 g, 33.9 mmol) was added slowly to a solution of N-benzyl-2-chloroacetamide (5.66 g, 30.8 mmol) in toluene (50 mL) under cooling with ice-water bath. The mixture was stirred at room temperature for 2 h, then NaN3 (3.01 g, 46.3 mmol) was added. The reaction mixture was stirred at room temperature for 30 min, water (0.8 mL) was added dropwise and the whole was refluxed for 5 h. After cooling, the reaction mixture was poured into water and extracted with chloroform. The combined organic layers were washed successively with water, NaOH solution 1M and saturated NaCl solution and dried over anhydrous MgSO4. After removal of the solvent, the crude product was purified by flash chromatography (ethyl acetate/hexane (1:2)) affording the tetrazole 3 as light yellow solid (3.47 g, 54%). m.p. 57–59 °C (from diethyl ether) (62–63 °C from literature) [24]; 1H-NMR (CDCl3) δ (ppm) 4.62 (s, 2H), 5.68 (s, 2H), 7.28–7.30 (m, 2H, Ar-H), 7.39–7.40 (m, 3H, Ar-H).')
    doc = Document(section_heading, synthesis_heading, amide_paragraph, tetrazole_paragraph)

    # Names expected as records carrying nothing but the name, in the
    # order the serialized records are compared.
    name_only = [
        'Chloroacetamide',
        'benzylamine',
        'chloroacetyl chloride',
        'dichloromethane',
        '1H',
        'PCl5',
        'NaN3',
        'chloroform',
        'NaOH',
        'NaCl',
        'ethyl acetate',
        'hexane',
        'tetrazole',
        'diethyl ether',
        'toluene',
        'MgSO4',
        '1H-NMR',
        'CDCl3',
        '2H',
        'Ar-H',
    ]

    # Compound 3: properties reported in the second paragraph.
    tetrazole_peaks = [
        {'shift': '4.62', 'number': '2H', 'multiplicity': 's'},
        {'shift': '5.68', 'number': '2H', 'multiplicity': 's'},
        {'shift': '7.28–7.30', 'number': '2H', 'assignment': 'Ar-H', 'multiplicity': 'm'},
        {'shift': '7.39–7.40', 'number': '3H', 'assignment': 'Ar-H', 'multiplicity': 'm'},
    ]
    tetrazole_record = {
        'melting_points': [{'units': '°C', 'value': '57–59'}],
        'nmr_spectra': [{'peaks': tetrazole_peaks, 'solvent': 'CDCl3', 'nucleus': '1H'}],
        'names': ['1-Benzyl-5-(chloromethyl)-1H-tetrazole'],
        'labels': ['3'],
    }

    # Compound 2: properties reported in the first paragraph.
    amide_peaks = [
        {'shift': '4.11', 'number': '2H', 'multiplicity': 's'},
        {'coupling': '6.0', 'number': '2H', 'shift': '4.50', 'coupling_units': 'Hz',
         'multiplicity': 'd'},
        {'shift': '6.89', 'number': '1H', 'multiplicity': 'br s'},
        {'shift': '7.26–7.36', 'number': '5H', 'assignment': 'Ar-H', 'multiplicity': 'm'},
    ]
    amide_record = {
        'melting_points': [{'units': '°C', 'value': '91–92'}],
        'nmr_spectra': [{'peaks': amide_peaks, 'solvent': 'CDCl3', 'nucleus': '1H'}],
        # The name appears with two capitalizations in the input (once in
        # each paragraph); both spellings are expected on this one record.
        'names': ['N-Benzyl-2-chloroacetamide', 'N-benzyl-2-chloroacetamide'],
        'labels': ['2'],
    }

    expected = [{'roles': ['product'], 'names': ['Phosphorus Ylide 5']}]
    expected.extend({'names': [name]} for name in name_only)
    expected.extend([tetrazole_record, amide_record])

    self.assertEqual(expected, doc.records.serialize())


if __name__ == '__main__':
Expand Down
5 changes: 5 additions & 0 deletions tests/test_parse_cem.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,11 @@ def test_consecutive_headings2(self):
Paragraph('The product had a melting point of 70-75° C. and has structural formula VII.')
)
results = [r.serialize() for r in d.records]
print(results)
print([
{'names': [u'5-Bromo-6-pentadecyl-2-hydroxybenzoic acid', u'DBAA'], 'roles': ['product']},
{'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}], 'labels': [u'VII'], 'roles': [u'formula']}
])
self.assertEqual(results, [
{'labels': [u'VII'], 'roles': [u'formula']},
{'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}],
Expand Down