diff --git a/pubs/commands/add_cmd.py b/pubs/commands/add_cmd.py index 8c5db77..a425434 100644 --- a/pubs/commands/add_cmd.py +++ b/pubs/commands/add_cmd.py @@ -1,3 +1,4 @@ +import argparse from ..uis import get_ui from .. import bibstruct from .. import content @@ -7,13 +8,21 @@ from .. import templates from .. import apis from .. import color from .. import pretty +from .. import utils + + +class ValidateDOI(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + doi = values + new_doi = utils.standardize_doi(doi) + setattr(namespace, self.dest, new_doi) def parser(subparsers, conf): parser = subparsers.add_parser('add', help='add a paper to the repository') parser.add_argument('bibfile', nargs='?', default=None, help='bibtex file') - parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None) + parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None, action=ValidateDOI) parser.add_argument('-I', '--isbn', help='isbn number to retrieve the bibtex entry, if it is not provided', default=None) parser.add_argument('-d', '--docfile', help='pdf or ps file', default=None) parser.add_argument('-t', '--tags', help='tags associated to the paper, separated by commas', @@ -21,9 +30,9 @@ def parser(subparsers, conf): parser.add_argument('-k', '--citekey', help='citekey associated with the paper;\nif not provided, one will be generated automatically.', default=None) parser.add_argument('-L', '--link', action='store_false', dest='copy', default=True, - help="don't copy document files, just create a link.") + help="don't copy document files, just create a link.") parser.add_argument('-M', '--move', action='store_true', dest='move', default=False, - help="move document instead of of copying (ignored if --link).") + help="move document instead of of copying (ignored if --link).") return parser diff --git a/pubs/utils.py b/pubs/utils.py index 2af0fa2..ba35da5 100644 --- a/pubs/utils.py +++ b/pubs/utils.py @@ -1,28 +1,34 @@ # Function here may belong somewhere else. In the mean time... +import re from . import color from . import pretty + def resolve_citekey(repo, citekey, ui=None, exit_on_fail=True): - """Check that a citekey exists, or autocompletes it if not ambiguous.""" - """ :returns found citekey """ + """Check that a citekey exists, or autocompletes it if not ambiguous. + :returns found citekey + """ # FIXME. Make me optionally non ui interactive/exiting citekeys = repo.citekeys_from_prefix(citekey) if len(citekeys) == 0: if ui is not None: - ui.error("No citekey named or beginning with '{}'".format(color.dye_out(citekey, 'citekey'))) + ui.error("No citekey named or beginning with '{}'".format( + color.dye_out(citekey, 'citekey'))) if exit_on_fail: ui.exit() elif len(citekeys) == 1: if citekeys[0] != citekey: if ui is not None: - ui.info("'{}' has been autocompleted into '{}'.".format(color.dye_out(citekey, 'citekey'), color.dye_out(citekeys[0], 'citekey'))) + ui.info("'{}' has been autocompleted into '{}'.".format( + color.dye_out(citekey, 'citekey'), + color.dye_out(citekeys[0], 'citekey'))) citekey = citekeys[0] elif citekey not in citekeys: if ui is not None: citekeys = sorted(citekeys) - ui.error("Be more specific; '{}' matches multiples citekeys:".format( - citekey)) + ui.error("Be more specific; '{}' matches multiples " + "citekeys:".format(citekey)) for c in citekeys: p = repo.pull_paper(c) ui.message(u' {}'.format(pretty.paper_oneliner(p))) @@ -44,3 +50,37 @@ def resolve_citekey_list(repo, citekeys, ui=None, exit_on_fail=True): ui.exit() else: return keys + + +def standardize_doi(doi): + """ + Given a putative doi, attempts to always return it in the form of + 10.XXXX/... Specifically designed to handle these cases: + - https://doi.org/ + - http://doi.org/ + - https://dx.doi.org/ + - http://dx.doi.org/ + - dx.doi.org/ + - doi.org/ + and attempts to verify doi adherence to DOI handbook standards and + crossref.org advice: + https://www.doi.org/doi_handbook/2_Numbering.html + https://www.crossref.org/blog/dois-and-matching-regular-expressions/ + + :returns standardized doi + """ + + doi_regexes = ( + '(10\.\d{4,9}/[-._;()/:A-z0-9\>\<]+)', + '(10.1002/[^\s]+)', + '(10\.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d)', + '(10\.1021/\w\w\d+\+)', + '(10\.1207/[\w\d]+\&\d+_\d+)') + doi_pattern = re.compile('|'.join(doi_regexes)) + + match = doi_pattern.search(doi) + if not match: + raise ValueError("Not a valid doi: %s", doi) + new_doi = match.group(0) + + return new_doi diff --git a/tests/test_doi.py b/tests/test_doi.py new file mode 100644 index 0000000..e694973 --- /dev/null +++ b/tests/test_doi.py @@ -0,0 +1,80 @@ +# coding: utf8 + +from __future__ import unicode_literals +import unittest + +from pubs.utils import standardize_doi + + +class TestDOIStandardization(unittest.TestCase): + + def setUp(self): + # some of these come from + # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page + self.crossref_dois = ( + '10.2310/JIM.0b013e31820bab4c', + '10.1007/978-3-642-28108-2_19', + '10.1016/S0735-1097(98)00347-7', + ) + + self.hard_dois = ( + '10.1175/1520-0485(2002)032<0870:CT>2.0.CO;2', + '10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S', + '10.1579/0044-7447(2006)35\[89:RDUICP\]2.0.CO;2', + ) + + self.currently_not_supported = ( + '10.1007.10/978-3-642-28108-2_19', + '10.1000.10/123456', + '10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7', + ) + + def test_http_dxdoi_org(self): + doi = 'http://dx.doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_https_dxdoi_org(self): + doi = 'https://dx.doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_http_doi_org(self): + doi = 'http://doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_https_doi_org(self): + doi = 'https://doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_doi_org(self): + doi = 'doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_dxdoi_org(self): + doi = 'dx.doi.org/10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_doi_colon(self): + doi = 'doi:10.1109/5.771073' + sdoi = standardize_doi(doi) + self.assertEqual(sdoi, '10.1109/5.771073') + + def test_crossref_dois(self): + for doi in self.crossref_dois: + sdoi = standardize_doi(doi) + self.assertEqual(doi, sdoi) + + def test_hard_dois(self): + for doi in self.hard_dois: + sdoi = standardize_doi(doi) + self.assertEqual(doi, sdoi) + + def test_currently_not_supported(self): + for doi in self.currently_not_supported: + with self.assertRaises(ValueError): + standardize_doi(doi)