Merge pull request #105 from wflynny/robust-doi

[Fix #95] robust handling of DOIs
main
Olivier Mangin 7 years ago committed by GitHub
commit 14440c5e4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,3 +1,4 @@
import argparse
from ..uis import get_ui from ..uis import get_ui
from .. import bibstruct from .. import bibstruct
from .. import content from .. import content
@ -7,13 +8,21 @@ from .. import templates
from .. import apis from .. import apis
from .. import color from .. import color
from .. import pretty from .. import pretty
from .. import utils
class ValidateDOI(argparse.Action):
    """argparse action that stores a ``--doi`` value in standardized form.

    The raw command-line value is passed through ``utils.standardize_doi``
    and the result is stored on the namespace under this action's ``dest``.
    Any exception raised by ``utils.standardize_doi`` propagates unchanged.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        # Normalize first so the rest of the command only ever sees the
        # cleaned-up DOI, never the URL/prefix form the user typed.
        setattr(namespace, self.dest, utils.standardize_doi(values))
def parser(subparsers, conf): def parser(subparsers, conf):
parser = subparsers.add_parser('add', help='add a paper to the repository') parser = subparsers.add_parser('add', help='add a paper to the repository')
parser.add_argument('bibfile', nargs='?', default=None, parser.add_argument('bibfile', nargs='?', default=None,
help='bibtex file') help='bibtex file')
parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None) parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None, action=ValidateDOI)
parser.add_argument('-I', '--isbn', help='isbn number to retrieve the bibtex entry, if it is not provided', default=None) parser.add_argument('-I', '--isbn', help='isbn number to retrieve the bibtex entry, if it is not provided', default=None)
parser.add_argument('-d', '--docfile', help='pdf or ps file', default=None) parser.add_argument('-d', '--docfile', help='pdf or ps file', default=None)
parser.add_argument('-t', '--tags', help='tags associated to the paper, separated by commas', parser.add_argument('-t', '--tags', help='tags associated to the paper, separated by commas',

@ -1,28 +1,34 @@
# Functions here may belong somewhere else. In the meantime... # Functions here may belong somewhere else. In the meantime...
import re
from . import color from . import color
from . import pretty from . import pretty
def resolve_citekey(repo, citekey, ui=None, exit_on_fail=True): def resolve_citekey(repo, citekey, ui=None, exit_on_fail=True):
"""Check that a citekey exists, or autocompletes it if not ambiguous.""" """Check that a citekey exists, or autocompletes it if not ambiguous.
""" :returns found citekey """ :returns found citekey
"""
# FIXME. Make me optionally non ui interactive/exiting # FIXME. Make me optionally non ui interactive/exiting
citekeys = repo.citekeys_from_prefix(citekey) citekeys = repo.citekeys_from_prefix(citekey)
if len(citekeys) == 0: if len(citekeys) == 0:
if ui is not None: if ui is not None:
ui.error("No citekey named or beginning with '{}'".format(color.dye_out(citekey, 'citekey'))) ui.error("No citekey named or beginning with '{}'".format(
color.dye_out(citekey, 'citekey')))
if exit_on_fail: if exit_on_fail:
ui.exit() ui.exit()
elif len(citekeys) == 1: elif len(citekeys) == 1:
if citekeys[0] != citekey: if citekeys[0] != citekey:
if ui is not None: if ui is not None:
ui.info("'{}' has been autocompleted into '{}'.".format(color.dye_out(citekey, 'citekey'), color.dye_out(citekeys[0], 'citekey'))) ui.info("'{}' has been autocompleted into '{}'.".format(
color.dye_out(citekey, 'citekey'),
color.dye_out(citekeys[0], 'citekey')))
citekey = citekeys[0] citekey = citekeys[0]
elif citekey not in citekeys: elif citekey not in citekeys:
if ui is not None: if ui is not None:
citekeys = sorted(citekeys) citekeys = sorted(citekeys)
ui.error("Be more specific; '{}' matches multiples citekeys:".format( ui.error("Be more specific; '{}' matches multiples "
citekey)) "citekeys:".format(citekey))
for c in citekeys: for c in citekeys:
p = repo.pull_paper(c) p = repo.pull_paper(c)
ui.message(u' {}'.format(pretty.paper_oneliner(p))) ui.message(u' {}'.format(pretty.paper_oneliner(p)))
@ -44,3 +50,37 @@ def resolve_citekey_list(repo, citekeys, ui=None, exit_on_fail=True):
ui.exit() ui.exit()
else: else:
return keys return keys
# Candidate DOI shapes; at each position of the input they are tried in
# this order. Raw strings keep the regex escapes out of Python's own
# string-escape processing.
_DOI_REGEXES = (
    # Generic crossref-style DOI. Note that the ``A-z`` range spans ASCII
    # 0x41-0x7A, so besides letters it also admits ``[ \ ] ^ _`` and the
    # backtick; bracketed DOIs rely on that, so it is kept as is.
    r'(10\.\d{4,9}/[-._;()/:A-z0-9\>\<]+)',
    # Wiley (prefix 10.1002): anything up to the next whitespace.
    r'(10\.1002/[^\s]+)',
    # NOTE(review): ``(\d+)`` below is a capture group, not literal
    # parentheses -- presumably ``\(\d+\)`` was intended; left unchanged.
    r'(10\.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d)',
    r'(10\.1021/\w\w\d+\+)',
    r'(10\.1207/[\w\d]+\&\d+_\d+)',
)
_DOI_PATTERN = re.compile('|'.join(_DOI_REGEXES))


def standardize_doi(doi):
    """Extract the bare ``10.XXXX/...`` DOI from a putative DOI string.

    Specifically designed to handle these cases:

        - https://doi.org/<doi>
        - http://doi.org/<doi>
        - https://dx.doi.org/<doi>
        - http://dx.doi.org/<doi>
        - dx.doi.org/<doi>
        - doi.org/<doi>

    and attempts to verify doi adherence to DOI handbook standards and
    crossref.org advice:
    https://www.doi.org/doi_handbook/2_Numbering.html
    https://www.crossref.org/blog/dois-and-matching-regular-expressions/

    The first substring of ``doi`` that looks like a DOI is returned, so
    any leading scheme/host/``doi:`` prefix is dropped.

    :param doi: string containing a DOI, possibly with a URL or prefix
    :returns: standardized doi
    :raises ValueError: if no DOI-like substring is found in ``doi``
    """
    match = _DOI_PATTERN.search(doi)
    if match is None:
        # Format the message here: passing ``doi`` as a second argument to
        # ValueError would leave the '%s' placeholder unfilled.
        raise ValueError("Not a valid doi: {}".format(doi))
    return match.group(0)

@ -0,0 +1,80 @@
# coding: utf8
from __future__ import unicode_literals
import unittest
from pubs.utils import standardize_doi
class TestDOIStandardization(unittest.TestCase):
    """Checks that ``standardize_doi`` reduces various inputs to a bare DOI."""

    def setUp(self):
        # some of these come from
        # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page
        self.crossref_dois = (
            '10.2310/JIM.0b013e31820bab4c',
            '10.1007/978-3-642-28108-2_19',
            '10.1016/S0735-1097(98)00347-7',
        )

        self.hard_dois = (
            '10.1175/1520-0485(2002)032<0870:CT>2.0.CO;2',
            '10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S',
            # The backslashes are part of the test value; they are written
            # as explicit ``\\`` because ``\[`` is an invalid escape
            # sequence in a non-raw string literal.
            '10.1579/0044-7447(2006)35\\[89:RDUICP\\]2.0.CO;2',
        )

        # Inputs for which standardize_doi is expected to raise ValueError.
        self.currently_not_supported = (
            '10.1007.10/978-3-642-28108-2_19',
            '10.1000.10/123456',
            '10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7',
        )

    def _assert_standardizes(self, raw, expected='10.1109/5.771073'):
        """Assert that ``standardize_doi(raw)`` returns ``expected``."""
        self.assertEqual(standardize_doi(raw), expected)

    def test_http_dxdoi_org(self):
        self._assert_standardizes('http://dx.doi.org/10.1109/5.771073')

    def test_https_dxdoi_org(self):
        self._assert_standardizes('https://dx.doi.org/10.1109/5.771073')

    def test_http_doi_org(self):
        self._assert_standardizes('http://doi.org/10.1109/5.771073')

    def test_https_doi_org(self):
        self._assert_standardizes('https://doi.org/10.1109/5.771073')

    def test_doi_org(self):
        self._assert_standardizes('doi.org/10.1109/5.771073')

    def test_dxdoi_org(self):
        self._assert_standardizes('dx.doi.org/10.1109/5.771073')

    def test_doi_colon(self):
        self._assert_standardizes('doi:10.1109/5.771073')

    def test_crossref_dois(self):
        # Already-bare DOIs must come back unchanged.
        for doi in self.crossref_dois:
            sdoi = standardize_doi(doi)
            self.assertEqual(doi, sdoi)

    def test_hard_dois(self):
        # DOIs with angle brackets, parentheses, brackets and semicolons
        # must also come back unchanged.
        for doi in self.hard_dois:
            sdoi = standardize_doi(doi)
            self.assertEqual(doi, sdoi)

    def test_currently_not_supported(self):
        for doi in self.currently_not_supported:
            with self.assertRaises(ValueError):
                standardize_doi(doi)
Loading…
Cancel
Save