diff --git a/pubs/commands/add_cmd.py b/pubs/commands/add_cmd.py index 8c5db77..f17a662 100644 --- a/pubs/commands/add_cmd.py +++ b/pubs/commands/add_cmd.py @@ -1,3 +1,4 @@ +import argparse from ..uis import get_ui from .. import bibstruct from .. import content @@ -7,13 +8,21 @@ from .. import templates from .. import apis from .. import color from .. import pretty +from .. import utils +class ValidateDOI(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + doi = values + new_doi = utils.standardize_doi(doi) + if not new_doi: + raise ValueError("Not a valid doi: %s", doi) + setattr(namespace, self.dest, new_doi) def parser(subparsers, conf): parser = subparsers.add_parser('add', help='add a paper to the repository') parser.add_argument('bibfile', nargs='?', default=None, help='bibtex file') - parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None) + parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None, action=ValidateDOI) parser.add_argument('-I', '--isbn', help='isbn number to retrieve the bibtex entry, if it is not provided', default=None) parser.add_argument('-d', '--docfile', help='pdf or ps file', default=None) parser.add_argument('-t', '--tags', help='tags associated to the paper, separated by commas', diff --git a/pubs/utils.py b/pubs/utils.py index 2af0fa2..bbfeab5 100644 --- a/pubs/utils.py +++ b/pubs/utils.py @@ -1,5 +1,7 @@ # Function here may belong somewhere else. In the mean time... +import re + from . import color from . import pretty @@ -44,3 +46,36 @@ def resolve_citekey_list(repo, citekeys, ui=None, exit_on_fail=True): ui.exit() else: return keys + +def standardize_doi(doi): + """ + Given a putative doi, attempts to always return it in the form of + 10.XXXX/... Specifically designed to handle these cases: + - https://doi.org/ + - http://doi.org/ + - https://dx.doi.org/ + - http://dx.doi.org/ + - dx.doi.org/ + - doi.org/ + and attempts to verify doi adherence to DOI handbook standards and + crossref.org advice: + https://www.doi.org/doi_handbook/2_Numbering.html + https://www.crossref.org/blog/dois-and-matching-regular-expressions/""" + """ :returns standardized doi """ + doi_regexes = ( + re.compile(r'(10\.\d{4,9}/[-._;()/:A-z0-9\>\<]+)'), + re.compile(r'(10.1002/[^\s]+)'), + re.compile(r'(10\.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d)'), + re.compile(r'(10\.1021/\w\w\d+\+)'), + re.compile(r'(10\.1207/[\w\d]+\&\d+_\d+)') + ) + + for doi_regex in doi_regexes: + match = doi_regex.search(doi) + if match: + new_doi = match.group(0) + break + else: + new_doi = None + + return new_doi diff --git a/tests/test_doi.py b/tests/test_doi.py new file mode 100644 index 0000000..4b345a6 --- /dev/null +++ b/tests/test_doi.py @@ -0,0 +1,80 @@ +# coding: utf8 + +from __future__ import unicode_literals +import unittest + +from pubs.p3 import ustr +from pubs.utils import standardize_doi + +class TestDOIStandardization(unittest.TestCase): + + def setUp(self): + # some of these come from + # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page + self.crossref_dois = ( + '10.2310/JIM.0b013e31820bab4c', + '10.1007/978-3-642-28108-2_19', + '10.1016/S0735-1097(98)00347-7', + ) + + self.hard_dois = ( + '10.1175/1520-0485(2002)032<0870:CT>2.0.CO;2', + '10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S', + '10.1579/0044-7447(2006)35\[89:RDUICP\]2.0.CO;2', + ) + + self.currently_not_supported = ( + '10.1007.10/978-3-642-28108-2_19', + '10.1000.10/123456', + '10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7', + ) + + def test_http_dxdoi_org(self): + doi = 'http://dx.doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_https_dxdoi_org(self): + doi = 'https://dx.doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_http_doi_org(self): + doi = 'http://doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_https_doi_org(self): + doi = 'https://doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_doi_org(self): + doi = 'doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_dxdoi_org(self): + doi = 'dx.doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_doi_colon_org(self): + doi = 'doi:10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_crossref_dois(self): + for doi in self.crossref_dois: + sdoi = standardize_doi(doi) + self.assertEqual(doi, sdoi) + + def test_hard_dois(self): + for doi in self.hard_dois: + sdoi = standardize_doi(doi) + self.assertEqual(doi, sdoi) + + def test_currently_not_supported(self): + for doi in self.currently_not_supported: + sdoi = standardize_doi(doi) + self.assertIs(sdoi, None)