Merge pull request #105 from wflynny/robust-doi
[Fix #95] robust handling of DOIs
This commit is contained in:
commit
14440c5e4c
@ -1,3 +1,4 @@
|
|||||||
|
import argparse
|
||||||
from ..uis import get_ui
|
from ..uis import get_ui
|
||||||
from .. import bibstruct
|
from .. import bibstruct
|
||||||
from .. import content
|
from .. import content
|
||||||
@ -7,13 +8,21 @@ from .. import templates
|
|||||||
from .. import apis
|
from .. import apis
|
||||||
from .. import color
|
from .. import color
|
||||||
from .. import pretty
|
from .. import pretty
|
||||||
|
from .. import utils
|
||||||
|
|
||||||
|
|
||||||
|
class ValidateDOI(argparse.Action):
    """argparse action that normalizes a DOI argument as it is parsed.

    The raw command-line value is passed through utils.standardize_doi
    before being stored on the namespace, so downstream code always sees
    a bare '10.XXXX/...' identifier. standardize_doi raises ValueError
    for values it cannot recognize as a DOI.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        # Store the standardized form instead of the raw user input.
        setattr(namespace, self.dest, utils.standardize_doi(values))
|
||||||
|
|
||||||
|
|
||||||
def parser(subparsers, conf):
|
def parser(subparsers, conf):
|
||||||
parser = subparsers.add_parser('add', help='add a paper to the repository')
|
parser = subparsers.add_parser('add', help='add a paper to the repository')
|
||||||
parser.add_argument('bibfile', nargs='?', default=None,
|
parser.add_argument('bibfile', nargs='?', default=None,
|
||||||
help='bibtex file')
|
help='bibtex file')
|
||||||
parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None)
|
parser.add_argument('-D', '--doi', help='doi number to retrieve the bibtex entry, if it is not provided', default=None, action=ValidateDOI)
|
||||||
parser.add_argument('-I', '--isbn', help='isbn number to retrieve the bibtex entry, if it is not provided', default=None)
|
parser.add_argument('-I', '--isbn', help='isbn number to retrieve the bibtex entry, if it is not provided', default=None)
|
||||||
parser.add_argument('-d', '--docfile', help='pdf or ps file', default=None)
|
parser.add_argument('-d', '--docfile', help='pdf or ps file', default=None)
|
||||||
parser.add_argument('-t', '--tags', help='tags associated to the paper, separated by commas',
|
parser.add_argument('-t', '--tags', help='tags associated to the paper, separated by commas',
|
||||||
@ -21,9 +30,9 @@ def parser(subparsers, conf):
|
|||||||
parser.add_argument('-k', '--citekey', help='citekey associated with the paper;\nif not provided, one will be generated automatically.',
|
parser.add_argument('-k', '--citekey', help='citekey associated with the paper;\nif not provided, one will be generated automatically.',
|
||||||
default=None)
|
default=None)
|
||||||
parser.add_argument('-L', '--link', action='store_false', dest='copy', default=True,
|
parser.add_argument('-L', '--link', action='store_false', dest='copy', default=True,
|
||||||
help="don't copy document files, just create a link.")
|
help="don't copy document files, just create a link.")
|
||||||
parser.add_argument('-M', '--move', action='store_true', dest='move', default=False,
|
parser.add_argument('-M', '--move', action='store_true', dest='move', default=False,
|
||||||
help="move document instead of of copying (ignored if --link).")
|
help="move document instead of of copying (ignored if --link).")
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,28 +1,34 @@
|
|||||||
# Function here may belong somewhere else. In the mean time...
|
# Function here may belong somewhere else. In the mean time...
|
||||||
|
import re
|
||||||
|
|
||||||
from . import color
|
from . import color
|
||||||
from . import pretty
|
from . import pretty
|
||||||
|
|
||||||
|
|
||||||
def resolve_citekey(repo, citekey, ui=None, exit_on_fail=True):
|
def resolve_citekey(repo, citekey, ui=None, exit_on_fail=True):
|
||||||
"""Check that a citekey exists, or autocompletes it if not ambiguous."""
|
"""Check that a citekey exists, or autocompletes it if not ambiguous.
|
||||||
""" :returns found citekey """
|
:returns found citekey
|
||||||
|
"""
|
||||||
# FIXME. Make me optionally non ui interactive/exiting
|
# FIXME. Make me optionally non ui interactive/exiting
|
||||||
citekeys = repo.citekeys_from_prefix(citekey)
|
citekeys = repo.citekeys_from_prefix(citekey)
|
||||||
if len(citekeys) == 0:
|
if len(citekeys) == 0:
|
||||||
if ui is not None:
|
if ui is not None:
|
||||||
ui.error("No citekey named or beginning with '{}'".format(color.dye_out(citekey, 'citekey')))
|
ui.error("No citekey named or beginning with '{}'".format(
|
||||||
|
color.dye_out(citekey, 'citekey')))
|
||||||
if exit_on_fail:
|
if exit_on_fail:
|
||||||
ui.exit()
|
ui.exit()
|
||||||
elif len(citekeys) == 1:
|
elif len(citekeys) == 1:
|
||||||
if citekeys[0] != citekey:
|
if citekeys[0] != citekey:
|
||||||
if ui is not None:
|
if ui is not None:
|
||||||
ui.info("'{}' has been autocompleted into '{}'.".format(color.dye_out(citekey, 'citekey'), color.dye_out(citekeys[0], 'citekey')))
|
ui.info("'{}' has been autocompleted into '{}'.".format(
|
||||||
|
color.dye_out(citekey, 'citekey'),
|
||||||
|
color.dye_out(citekeys[0], 'citekey')))
|
||||||
citekey = citekeys[0]
|
citekey = citekeys[0]
|
||||||
elif citekey not in citekeys:
|
elif citekey not in citekeys:
|
||||||
if ui is not None:
|
if ui is not None:
|
||||||
citekeys = sorted(citekeys)
|
citekeys = sorted(citekeys)
|
||||||
ui.error("Be more specific; '{}' matches multiples citekeys:".format(
|
ui.error("Be more specific; '{}' matches multiples "
|
||||||
citekey))
|
"citekeys:".format(citekey))
|
||||||
for c in citekeys:
|
for c in citekeys:
|
||||||
p = repo.pull_paper(c)
|
p = repo.pull_paper(c)
|
||||||
ui.message(u' {}'.format(pretty.paper_oneliner(p)))
|
ui.message(u' {}'.format(pretty.paper_oneliner(p)))
|
||||||
@ -44,3 +50,37 @@ def resolve_citekey_list(repo, citekeys, ui=None, exit_on_fail=True):
|
|||||||
ui.exit()
|
ui.exit()
|
||||||
else:
|
else:
|
||||||
return keys
|
return keys
|
||||||
|
|
||||||
|
|
||||||
|
def standardize_doi(doi):
    """
    Given a putative doi, attempts to always return it in the form of
    10.XXXX/... Specifically designed to handle these cases:

    - https://doi.org/<doi>
    - http://doi.org/<doi>
    - https://dx.doi.org/<doi>
    - http://dx.doi.org/<doi>
    - dx.doi.org/<doi>
    - doi.org/<doi>

    and attempts to verify doi adherence to DOI handbook standards and
    crossref.org advice:

    https://www.doi.org/doi_handbook/2_Numbering.html
    https://www.crossref.org/blog/dois-and-matching-regular-expressions/

    :param doi: string that may contain a DOI, optionally prefixed by a
        doi.org / dx.doi.org URL or a 'doi:' scheme.
    :returns: the standardized doi (bare '10.NNNN/...' identifier)
    :raises ValueError: if no DOI-like substring is found in `doi`.
    """
    # Raw strings: these patterns are full of backslash escapes; non-raw
    # literals like '\d' and '\<' are invalid escape sequences and warn
    # on modern Python.
    # NOTE: '[A-z]' is intentional (taken from the crossref recipes): it
    # also matches '[', '\\', ']', '^', '_' and '`', which legitimately
    # occur in some publisher DOI suffixes.
    doi_regexes = (
        r'(10\.\d{4,9}/[-._;()/:A-z0-9\>\<]+)',
        r'(10.1002/[^\s]+)',
        r'(10\.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d)',
        r'(10\.1021/\w\w\d+\+)',
        r'(10\.1207/[\w\d]+\&\d+_\d+)')
    doi_pattern = re.compile('|'.join(doi_regexes))

    # search (not match) deliberately skips any URL/scheme prefix.
    match = doi_pattern.search(doi)
    if not match:
        # Bug fix: ValueError does not apply printf-style arguments, so the
        # offending value must be interpolated into the message explicitly.
        raise ValueError("Not a valid doi: %s" % doi)
    new_doi = match.group(0)

    return new_doi
|
||||||
|
80
tests/test_doi.py
Normal file
80
tests/test_doi.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
# coding: utf8
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from pubs.utils import standardize_doi
|
||||||
|
|
||||||
|
|
||||||
|
class TestDOIStandardization(unittest.TestCase):
    """Exercise standardize_doi on URL-prefixed, plain, and tricky DOIs."""

    def setUp(self):
        # some of these come from
        # https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page
        self.crossref_dois = (
            '10.2310/JIM.0b013e31820bab4c',
            '10.1007/978-3-642-28108-2_19',
            '10.1016/S0735-1097(98)00347-7',
        )

        self.hard_dois = (
            '10.1175/1520-0485(2002)032<0870:CT>2.0.CO;2',
            '10.1002/(SICI)1522-2594(199911)42:5<952::AID-MRM16>3.0.CO;2-S',
            '10.1579/0044-7447(2006)35\[89:RDUICP\]2.0.CO;2',
        )

        self.currently_not_supported = (
            '10.1007.10/978-3-642-28108-2_19',
            '10.1000.10/123456',
            '10.1016.12.31/nature.S0735-1097(98)2000/12/31/34:7-7',
        )

    def _assert_standardizes(self, raw):
        # Shared check: every supported prefix form reduces to the bare DOI.
        self.assertEqual(standardize_doi(raw), '10.1109/5.771073')

    def test_http_dxdoi_org(self):
        self._assert_standardizes('http://dx.doi.org/10.1109/5.771073')

    def test_https_dxdoi_org(self):
        self._assert_standardizes('https://dx.doi.org/10.1109/5.771073')

    def test_http_doi_org(self):
        self._assert_standardizes('http://doi.org/10.1109/5.771073')

    def test_https_doi_org(self):
        self._assert_standardizes('https://doi.org/10.1109/5.771073')

    def test_doi_org(self):
        self._assert_standardizes('doi.org/10.1109/5.771073')

    def test_dxdoi_org(self):
        self._assert_standardizes('dx.doi.org/10.1109/5.771073')

    def test_doi_colon(self):
        self._assert_standardizes('doi:10.1109/5.771073')

    def test_crossref_dois(self):
        # Already-standard DOIs must round-trip unchanged.
        for doi in self.crossref_dois:
            self.assertEqual(standardize_doi(doi), doi)

    def test_hard_dois(self):
        # DOIs with angle brackets, SICI suffixes, and escaped brackets.
        for doi in self.hard_dois:
            self.assertEqual(standardize_doi(doi), doi)

    def test_currently_not_supported(self):
        # Unusual registrant suffixes the current patterns reject.
        for doi in self.currently_not_supported:
            with self.assertRaises(ValueError):
                standardize_doi(doi)
|
Loading…
x
Reference in New Issue
Block a user