diff --git a/chemdataextractor/doc/document.py b/chemdataextractor/doc/document.py index 3d72760..90eadaa 100644 --- a/chemdataextractor/doc/document.py +++ b/chemdataextractor/doc/document.py @@ -185,7 +185,7 @@ def records(self): sent_record = first_sent_records[0] if sent_record.labels or (sent_record.names and len(sent_record.names[0]) > len(el.sentences[0].text) / 2): head_def_record = sent_record - head_def_record_i = i + head_def_record_i = i - 1 # NOTE(review): workaround for CEMs containing NMR text that are sometimes mis-detected; with this offset, the later check that head_def_record_i plus one equals i is true at this same index, so confirm that is intended for record in el.records: # Keep track of the most recent record with labels @@ -215,10 +215,11 @@ def records(self): continue else: # print(record.serialize()) + # TODO: check the names and labels, not the whole record # We have property values but no names or labels... try merge those from previous if isinstance(el, Paragraph) and (head_def_record or last_product_record or last_id_record or title_record): # head_def_record from heading takes priority if the heading directly precedes the paragraph ( NOPE: or the last_id_record has no name) - if head_def_record_i and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)): + if last_id_record and not last_id_record.names and head_def_record_i is not None and head_def_record_i + 1 == i: # requires a last_id_record without names in addition to the heading index check if head_def_record: record.names = head_def_record.names record.labels = head_def_record.labels @@ -272,6 +273,13 @@ def records(self): record.names.append(name) # Merge records with any shared name/label + temp_record = [] + for record in records: + if len(record.labels) <= 1: + temp_record.append(record) + + records.models = temp_record + len_l = len(records) i = 0 while i < (len_l - 1): diff --git a/tests/test_extract.py b/tests/test_extract.py index 99c0f4a..35e4c6f 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -17,24 +17,23 @@ from chemdataextractor import Document from chemdataextractor.doc import 
Heading, Paragraph - logging.basicConfig(level=logging.DEBUG) log = logging.getLogger(__name__) - unittest.util._MAX_LENGTH = 2000 class TestExtract(unittest.TestCase): - maxDiff = None def test_melting_point_heading_salt(self): """Test extraction of melting point from a heading and paragraphs. Example taken from patent US06840965B2.""" d = Document( Heading('D. Synthesis of 4-Amino-2-(3-thienyl)phenol Hydrochloride'), - Paragraph('3 g (13.5 mmoles) of 4-nitro-2-(3-thienyl)phenol was dissolved in 40 mL of ethanol and hydrogenated at 25° C. in the presence of 600 mg of a palladium—active carbon catalyst (10%). After the theoretically required amount of hydrogen had been absorbed, the catalyst was filtered off. Following concentration in a rotary evaporator, the reaction mixture was poured onto 20 mL of cold diethyl ether. The precipitated product was filtered off and dried.'), - Paragraph('This gave 1.95 g (75% of the theoretical) of 4-amino-2-(3-thienyl)phenol hydrochloride with a melting point of 130-132° C.') + Paragraph( + '3 g (13.5 mmoles) of 4-nitro-2-(3-thienyl)phenol was dissolved in 40 mL of ethanol and hydrogenated at 25° C. in the presence of 600 mg of a palladium—active carbon catalyst (10%). After the theoretically required amount of hydrogen had been absorbed, the catalyst was filtered off. Following concentration in a rotary evaporator, the reaction mixture was poured onto 20 mL of cold diethyl ether. 
The precipitated product was filtered off and dried.'), + Paragraph( + 'This gave 1.95 g (75% of the theoretical) of 4-amino-2-(3-thienyl)phenol hydrochloride with a melting point of 130-132° C.') ) expected = [ @@ -44,7 +43,9 @@ def test_melting_point_heading_salt(self): {'names': ['carbon']}, {'names': ['hydrogen']}, {'names': ['diethyl ether']}, - {'melting_points': [{'units': '°C', 'value': '130-132'}], 'names': ['4-Amino-2-(3-thienyl)phenol Hydrochloride', '4-amino-2-(3-thienyl)phenol hydrochloride'], 'roles': ['product']} + {'melting_points': [{'units': '°C', 'value': '130-132'}], + 'names': ['4-Amino-2-(3-thienyl)phenol Hydrochloride', '4-amino-2-(3-thienyl)phenol hydrochloride'], + 'roles': ['product']} ] self.assertEqual(expected, d.records.serialize()) @@ -55,7 +56,73 @@ def test_parse_control_character(self): expected = [{'names': ['2,4,6-trinitrotoluene']}] self.assertEqual(expected, d.records.serialize()) + def test_title_parse(self): + """Test that two consecutive headings followed by two paragraphs yield the expected records.""" + d = Document( + Heading('3.2. Experimental Details'), + Heading('3.2.1. Synthesis of Phosphorus Ylide 5'), + Paragraph('N-Benzyl-2-chloroacetamide (2): Chloroacetamide 2 was prepared following the procedure described in the literature [23]. To a stirred solution of benzylamine (7.8 mL, 70.8 mmol) in toluene (60 mL) under cooling with ice bath, chloroacetyl chloride (4 g, 35.4 mmol) was slowly added. The reaction mixture was stirred vigorously for 1h at room temperature. The solvent was evaporated under vacuum, the crude reaction was dissolved in dichloromethane (100 mL) and washed with water (3 × 50 mL). The organic layer was dried over anhydrous MgSO4, filtered and the solvent evaporated under vacuum. The product was obtained as a white solid (6.30 g, 97%). m.p. 
91–92 °C (93–96 °C from literature) [23]; 1H-NMR (CDCl3) δ 4.11 (s, 2H), 4.50 (d, 2H, J = 6.0 Hz), 6.89 (br s, 1H), 7.26–7.36 (m, 5H, Ar-H).'), + Paragraph('1-Benzyl-5-(chloromethyl)-1H-tetrazole (3): Compound 3 was prepared by an analogous method to that described in the literature [24]. PCl5 (7.06 g, 33.9 mmol) was added slowly to a solution of N-benzyl-2-chloroacetamide (5.66 g, 30.8 mmol) in toluene (50 mL) under cooling with ice-water bath. The mixture was stirred at room temperature for 2 h, then NaN3 (3.01 g, 46.3 mmol) was added. The reaction mixture was stirred at room temperature for 30 min, water (0.8 mL) was added dropwise and the whole was refluxed for 5 h. After cooling, the reaction mixture was poured into water and extracted with chloroform. The combined organic layers were washed successively with water, NaOH solution 1M and saturated NaCl solution and dried over anhydrous MgSO4. After removal of the solvent, the crude product was purified by flash chromatography (ethyl acetate/hexane (1:2)) affording the tetrazole 3 as light yellow solid (3.47 g, 54%). m.p. 
57–59 °C (from diethyl ether) (62–63 °C from literature) [24]; 1H-NMR (CDCl3) δ (ppm) 4.62 (s, 2H), 5.68 (s, 2H), 7.28–7.30 (m, 2H, Ar-H), 7.39–7.40 (m, 3H, Ar-H).') + ) + expected = [ + {'roles': ['product'], 'names': ['Phosphorus Ylide 5']}, + {'names': ['Chloroacetamide']}, + {'names': ['benzylamine']}, + {'names': ['chloroacetyl chloride']}, + {'names': ['dichloromethane']}, + {'names': ['1H']}, + {'names': ['PCl5']}, + {'names': ['NaN3']}, + {'names': ['chloroform']}, + {'names': ['NaOH']}, + {'names': ['NaCl']}, + {'names': ['ethyl acetate']}, + {'names': ['hexane']}, + {'names': ['tetrazole']}, + {'names': ['diethyl ether']}, + {'names': ['toluene']}, + {'names': ['MgSO4']}, + {'names': ['1H-NMR']}, + {'names': ['CDCl3']}, + {'names': ['2H']}, + {'names': ['Ar-H']}, + { + 'melting_points': [ + {'units': '°C', 'value': '57–59'} + ], + 'nmr_spectra': [{ + 'peaks': [ + {'shift': '4.62', 'number': '2H', 'multiplicity': 's'}, + {'shift': '5.68', 'number': '2H', 'multiplicity': 's'}, + {'shift': '7.28–7.30', 'number': '2H', 'assignment': 'Ar-H', 'multiplicity': 'm'}, + {'shift': '7.39–7.40', 'number': '3H', 'assignment': 'Ar-H', 'multiplicity': 'm'} + ], + 'solvent': 'CDCl3', + 'nucleus': '1H'} + ], + 'names': ['1-Benzyl-5-(chloromethyl)-1H-tetrazole'], + 'labels': ['3'] + }, + { + 'melting_points': [ + {'units': '°C', 'value': '91–92'} + ], + 'nmr_spectra': [{ + 'peaks': [ + {'shift': '4.11', 'number': '2H', 'multiplicity': 's'}, + {'coupling': '6.0', 'number': '2H', 'shift': '4.50', 'coupling_units': 'Hz', + 'multiplicity': 'd'}, + {'shift': '6.89', 'number': '1H', 'multiplicity': 'br s'}, + {'shift': '7.26–7.36', 'number': '5H', 'assignment': 'Ar-H', 'multiplicity': 'm'} + ], + 'solvent': 'CDCl3', + 'nucleus': '1H'} + ], + 'names': ['N-Benzyl-2-chloroacetamide', 'N-benzyl-2-chloroacetamide'], # the two names differ only in capitalization; the extractor is expected to keep both in this record + 'labels': ['2'] + }] + self.assertEqual(expected, d.records.serialize()) if __name__ == '__main__': diff 
--git a/tests/test_parse_cem.py b/tests/test_parse_cem.py index 3c268b1..0805b09 100644 --- a/tests/test_parse_cem.py +++ b/tests/test_parse_cem.py @@ -403,6 +403,11 @@ def test_consecutive_headings2(self): Paragraph('The product had a melting point of 70-75° C. and has structural formula VII.') ) results = [r.serialize() for r in d.records] + print(results) + print([ + {'names': [u'5-Bromo-6-pentadecyl-2-hydroxybenzoic acid', u'DBAA'], 'roles': ['product']}, + {'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}], 'labels': [u'VII'], 'roles': [u'formula']} + ]) self.assertEqual(results, [ {'labels': [u'VII'], 'roles': [u'formula']}, {'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}],