[Fix #95] robust handling of DOIs

Added a DOI-standardizing regex function to utils.py, which add_cmd.py
calls when a DOI is given on the command line.  DOI validation is applied
during argument parsing by means of a custom argparse.Action.
main
Bill Flynn 7 years ago
parent 28a026d308
commit e2ad39ca08

@ -1,3 +1,4 @@
import argparse
from ..uis import get_ui
from .. import bibstruct
from .. import content
@ -7,13 +8,21 @@ from .. import templates
from .. import apis
from .. import color
from .. import pretty
from .. import utils
class ValidateDOI(argparse.Action):
    """argparse action that standardizes a DOI option while it is parsed.

    The value supplied for the option is passed through
    ``utils.standardize_doi`` and the standardized result is what gets
    stored on the namespace.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        """Standardize ``values`` and store the result under ``self.dest``.

        :param parser: the parser invoking this action (unused here).
        :param namespace: object on which the parsed value is set.
        :param values: the value supplied for the option.
        :param option_string: the option string that triggered the action
            (unused here).
        :raises ValueError: if ``utils.standardize_doi`` returns a falsy
            value for ``values``.
        """
        doi = values
        new_doi = utils.standardize_doi(doi)
        if not new_doi:
            # Build the message explicitly: handing the DOI to ValueError as
            # a second positional argument leaves the placeholder unfilled.
            raise ValueError("Not a valid doi: {}".format(doi))
        setattr(namespace, self.dest, new_doi)
def parser(subparsers, conf):
parser = subparsers.add_parser('add', help='add a paper to the repository')
parser.add_argument('bibfile', nargs='?', default=None,
help='bibtex file')
parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None)
parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None, action=ValidateDOI)
parser.add_argument('-I', '--isbn', help='isbn number to retrieve the bibtex entry, if it is not provided', default=None)
parser.add_argument('-d', '--docfile', help='pdf or ps file', default=None)
parser.add_argument('-t', '--tags', help='tags associated to the paper, separated by commas',

@ -1,5 +1,7 @@
# Function here may belong somewhere else. In the mean time...
import re
from . import color
from . import pretty
@ -44,3 +46,36 @@ def resolve_citekey_list(repo, citekeys, ui=None, exit_on_fail=True):
ui.exit()
else:
return keys
# DOI patterns, tried in order; the first pattern that matches wins.
# Compiled once at import time instead of on every call.
#
# NOTE: in the first pattern the ``A-z`` range also spans the ASCII
# characters that sit between ``Z`` and ``a`` (``[``, ``\``, ``]``, ``^``,
# ``_`` and the backtick), so DOIs containing square brackets match it.
#
# NOTE(review): the first, general pattern can match a leading portion of a
# DOI that a later, more specific pattern would match in full (e.g. one
# ending in ``+``, which is not in its character class); the shorter match
# is then returned. Confirm whether that ordering is intended.
_DOI_REGEXES = (
    re.compile(r'(10\.\d{4,9}/[-._;()/:A-z0-9\>\<]+)'),
    # The dot after "10" is escaped so that only a literal "10.1002/" prefix
    # is accepted (an unescaped dot would match any character there).
    re.compile(r'(10\.1002/[^\s]+)'),
    re.compile(r'(10\.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d)'),
    re.compile(r'(10\.1021/\w\w\d+\+)'),
    re.compile(r'(10\.1207/[\w\d]+\&\d+_\d+)'),
)


def standardize_doi(doi):
    """
    Given a putative doi, attempts to always return it in the form of
    10.XXXX/... Specifically designed to handle these cases:

        - https://doi.org/<doi>
        - http://doi.org/<doi>
        - https://dx.doi.org/<doi>
        - http://dx.doi.org/<doi>
        - dx.doi.org/<doi>
        - doi.org/<doi>

    and attempts to verify doi adherence to DOI handbook standards and
    crossref.org advice:
    https://www.doi.org/doi_handbook/2_Numbering.html
    https://www.crossref.org/blog/dois-and-matching-regular-expressions/

    The input is searched (not anchored), so any text before the DOI, such
    as a URL prefix, is dropped from the result.

    :param doi: string that may contain a DOI; ``None`` or an empty string
        is treated as "no DOI".
    :returns: the standardized doi, or ``None`` if no pattern matches.
    """
    if not doi:
        return None
    for doi_regex in _DOI_REGEXES:
        match = doi_regex.search(doi)
        if match:
            return match.group(0)
    return None

@ -0,0 +1,80 @@
# coding: utf8
from __future__ import unicode_literals
import unittest
from pubs.p3 import ustr
from pubs.utils import standardize_doi
class TestDOIStandardization(unittest.TestCase):
    """Tests for ``standardize_doi`` on URL-prefixed and bare DOIs."""

    def setUp(self):
        # some of these come from
        # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page
        self.crossref_dois = (
            '10.2310/JIM.0b013e31820bab4c',
            '10.1007/978-3-642-28108-2_19',
            '10.1016/S0735-1097(98)00347-7',
        )

        self.hard_dois = (
            '10.1175/1520-0485(2002)032<0870:CT>2.0.CO;2',
            '10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S',
            # Raw string: ``\[`` and ``\]`` are not valid string escapes, so
            # a plain literal draws an invalid-escape warning. The value,
            # backslashes included, is the same as before.
            # NOTE(review): the backslashes may be escaping carried over
            # from where this sample was copied -- confirm whether they
            # belong in the DOI.
            r'10.1579/0044-7447(2006)35\[89:RDUICP\]2.0.CO;2',
        )

        self.currently_not_supported = (
            '10.1007.10/978-3-642-28108-2_19',
            '10.1000.10/123456',
            '10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7',
        )

    def _assert_standardizes_to(self, raw, expected):
        """Assert that ``standardize_doi(raw)`` equals ``expected``."""
        self.assertEqual(standardize_doi(raw), expected)

    def test_http_dxdoi_org(self):
        self._assert_standardizes_to('http://dx.doi.org/10.1109/5.771073',
                                     '10.1109/5.771073')

    def test_https_dxdoi_org(self):
        self._assert_standardizes_to('https://dx.doi.org/10.1109/5.771073',
                                     '10.1109/5.771073')

    def test_http_doi_org(self):
        self._assert_standardizes_to('http://doi.org/10.1109/5.771073',
                                     '10.1109/5.771073')

    def test_https_doi_org(self):
        self._assert_standardizes_to('https://doi.org/10.1109/5.771073',
                                     '10.1109/5.771073')

    def test_doi_org(self):
        self._assert_standardizes_to('doi.org/10.1109/5.771073',
                                     '10.1109/5.771073')

    def test_dxdoi_org(self):
        self._assert_standardizes_to('dx.doi.org/10.1109/5.771073',
                                     '10.1109/5.771073')

    def test_doi_colon_org(self):
        self._assert_standardizes_to('doi:10.1109/5.771073',
                                     '10.1109/5.771073')

    def test_crossref_dois(self):
        # Bare DOIs must come back unchanged.
        for doi in self.crossref_dois:
            self._assert_standardizes_to(doi, doi)

    def test_hard_dois(self):
        # DOIs with angle brackets, parentheses, etc. must come back unchanged.
        for doi in self.hard_dois:
            self._assert_standardizes_to(doi, doi)

    def test_currently_not_supported(self):
        # These inputs are expected to be rejected (None) for now.
        for doi in self.currently_not_supported:
            self.assertIsNone(standardize_doi(doi))
Loading…
Cancel
Save