Skip to content

Commit 46dc0aa

Browse files
committed
ISSN Extractor
1 parent 72c4e6d commit 46dc0aa

File tree

6 files changed

+36
-4
lines changed

6 files changed

+36
-4
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,4 @@ docs/_build/
6161

6262
# Pycharm directories
6363
.idea
64+
venv/

README.rst

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Documentation is provided ``$ mwcitations extract -h``.
3232
* DOI
3333
* ISBN
3434
* arXiv
35+
* ISSN
3536

3637
Outputs a TSV file with the following fields:
3738

mwcites/extractors/issn.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import re
2+
from ..identifier import Identifier
3+
4+
# Matches an "issn" label (any letter case because of re.I), an optional "="
# with at most one whitespace character on either side, then an ISSN written
# as NNNN-NNNC, where C is a digit or the check character "X".
# Group 1 is the hyphenated ISSN; group 2 is the final check character.
# A raw string is used so that "\s" reaches the regex engine untouched instead
# of being treated as an (invalid) string-literal escape sequence.
ISSN_RE = re.compile(r'issn\s?=?\s?([0-9]{4}-[0-9]{3}([0-9]|X))', re.I)


def extract(text):
    """
    Generate an ``Identifier('issn', value)`` for each ISSN found in `text`.

    :Parameters:
        text : `str`
            The text to scan for ``issn``-labelled identifiers.

    :Returns:
        A generator of `Identifier` objects, one per match of `ISSN_RE`, in
        the order the matches occur.  The value is the eight-character ISSN
        with the hyphen removed.
    """
    for match in ISSN_RE.finditer(text):
        # Because matching is case-insensitive, the check character may be
        # captured as "x"; upper-case it so that one ISSN always produces the
        # same identifier value.
        yield Identifier('issn', match.group(1).replace('-', '').upper())

mwcites/extractors/tests/test_issn.py

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import pprint
2+
from nose.tools import eq_
3+
4+
from .. import issn
5+
from ...identifier import Identifier
6+
7+
INPUT_TEXT = """
8+
{{cite book|work=Billboard|title=Sinatra FBI Files Opened|first=Bill|last=Holland|url=https://books.google.com/books?id=KQoEAAAAMBAJ&dq=Bill+Holland+1998+Billboard+page+10&q=walter+winchell#v=snippet&q=walter%20winchell&f=false|date=December 19, 1998|page=10|issn=0006-2510}}
9+
"""
10+
11+
12+
EXPECTED = [
13+
Identifier('issn', '00062510'),
14+
]
15+
16+
def test_extract():
    """Check that the ISSN extractor yields exactly EXPECTED for INPUT_TEXT."""
    found = list(issn.extract(INPUT_TEXT))
    # Print the actual identifiers first, then the expected ones, so that a
    # failing run shows both sides of the comparison.
    for identifiers in (found, EXPECTED):
        pprint.pprint(identifiers)
    eq_(found, EXPECTED)

mwcites/utilities/extract.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* DOI
1010
* ISBN
1111
* arXiv
12+
* ISSN
1213
1314
Outputs a TSV file with the following fields:
1415
@@ -41,9 +42,9 @@
4142

4243
import mysqltsv
4344

44-
from ..extractors import arxiv, doi, isbn, pubmed
45+
from ..extractors import arxiv, doi, isbn, pubmed, issn
4546

46-
ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv]
47+
ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv, issn]
4748

4849
HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id")
4950

@@ -60,11 +61,11 @@ def main(argv=None):
6061
run(dump_files, extractors)
6162

6263
def run(dump_files, extractors):
63-
writer = mysqltsv.Writer(sts.stdout, headers=HEADERS)
64+
writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)
6465

6566
cites = extract(dump_files, extractors=extractors)
6667
for page_id, title, rev_id, timestamp, type, id in cites:
67-
writer.write(page_id, title, rev_id, timestamp.long_format(), type, id)
68+
writer.write([page_id, title, rev_id, timestamp.long_format(), type, id])
6869

6970
def extract(dump_files, extractors=ALL_EXTRACTORS):
7071
"""

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ docopt
22
more-itertools
33
mwparserfromhell
44
mwxml
5+
mysqltsv

0 commit comments

Comments
 (0)