ISSN Extractor

Xarvalus · Xarvalus · commit 46dc0aaeb3d6 · 2018-05-31T23:26:05.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -61,3 +61,4 @@ docs/_build/
 
 # Pycharm directories
 .idea
+venv/
diff --git a/README.rst b/README.rst
@@ -32,6 +32,7 @@ Documentation is provided ``$ mwcitations extract -h``.
      * DOI
      * ISBN
      * arXiv
+     * ISSN
 
     Outputs a TSV file with the following fields:
 
diff --git a/mwcites/extractors/issn.py b/mwcites/extractors/issn.py
@@ -0,0 +1,8 @@
+import re
+from ..identifier import Identifier
+
+ISSN_RE = re.compile('issn\s?=?\s?([0-9]{4}\-[0-9]{3}([0-9]|X))', re.I)
+
+def extract(text):
+    for match in ISSN_RE.finditer(text):
+        yield Identifier('issn', match.group(1).replace('-', ''))
diff --git a/mwcites/extractors/tests/test_issn.py b/mwcites/extractors/tests/test_issn.py
@@ -0,0 +1,20 @@
+import pprint
+from nose.tools import eq_
+
+from .. import issn
+from ...identifier import Identifier
+
+INPUT_TEXT = """
+ {{cite book|work=Billboard|title=Sinatra FBI Files Opened|first=Bill|last=Holland|url=https://books.google.com/books?id=KQoEAAAAMBAJ&dq=Bill+Holland+1998+Billboard+page+10&q=walter+winchell#v=snippet&q=walter%20winchell&f=false|date=December 19, 1998|page=10|issn=0006-2510}}
+    """
+
+
+EXPECTED = [
+    Identifier('issn', '00062510'),
+]
+
+def test_extract():
+    ids = list(issn.extract(INPUT_TEXT))
+    pprint.pprint(ids)
+    pprint.pprint(EXPECTED)
+    eq_(ids, EXPECTED)
diff --git a/mwcites/utilities/extract.py b/mwcites/utilities/extract.py
@@ -9,6 +9,7 @@
  * DOI
  * ISBN
  * arXiv
+ * ISSN
 
 Outputs a TSV file with the following fields:
 
@@ -41,9 +42,9 @@
 
 import mysqltsv
 
-from ..extractors import arxiv, doi, isbn, pubmed
+from ..extractors import arxiv, doi, isbn, pubmed, issn
 
-ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv]
+ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv, issn]
 
 HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id")
 
@@ -60,11 +61,11 @@ def main(argv=None):
     run(dump_files, extractors)
 
 def run(dump_files, extractors):
-    writer = mysqltsv.Writer(sts.stdout, headers=HEADERS)
+    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)
 
     cites = extract(dump_files, extractors=extractors)
     for page_id, title, rev_id, timestamp, type, id in cites:
-        writer.write(page_id, title, rev_id, timestamp.long_format(), type, id)
+        writer.write([page_id, title, rev_id, timestamp.long_format(), type, id])
 
 def extract(dump_files, extractors=ALL_EXTRACTORS):
     """
diff --git a/requirements.txt b/requirements.txt
@@ -2,3 +2,4 @@ docopt
 more-itertools
 mwparserfromhell
 mwxml
+mysqltsv

Original file line number	Diff line number	Diff line change
`@@ -61,3 +61,4 @@ docs/_build/`
`61`	`61`
`62`	`62`	`# Pycharm directories`
`63`	`63`	`.idea`
	`64`	`+venv/`
Original file line number	Diff line change
      * DOI
      * ISBN
      * arXiv
 +     * ISSN
     Outputs a TSV file with the following fields: