From be253f9084fd4ca5f9fad0075cc15cdc6e78f4b1 Mon Sep 17 00:00:00 2001
From: "Fabien C. Y. Benureau"
Date: Sun, 5 Aug 2018 21:33:25 +0900
Subject: [PATCH] handling of arxiv errors

---
 pubs/apis.py       | 157 +++++++++++++++++++++++++++++++++++++--------
 setup.py           |   2 +-
 tests/test_apis.py |  26 ++++++--
 3 files changed, 154 insertions(+), 31 deletions(-)

diff --git a/pubs/apis.py b/pubs/apis.py
index 014217f..5c30bf6 100644
--- a/pubs/apis.py
+++ b/pubs/apis.py
@@ -1,4 +1,6 @@
 """Interface for Remote Bibliographic APIs"""
+import re
+import datetime
 
 import requests
 import bibtexparser
@@ -7,7 +9,7 @@ import feedparser
 from bs4 import BeautifulSoup
 
 
-class ReferenceNotFoundException(Exception):
+class ReferenceNotFoundError(Exception):
     pass
 
 
@@ -50,53 +52,156 @@ def get_bibentry_from_api(id_str, id_type, rp):
     return bibentry
 
 
+def _get_request(url, headers=None):
+    """Send a GET request to a url and return the `requests` response object.
+
+    :raise ReferenceNotFoundError: if anything goes wrong (connection refused,
+                                   timeout, HTTP status error (401, 404, etc.)).
+    """
+    try:
+        r = requests.get(url, headers=headers)
+        r.raise_for_status()
+        return r
+    except requests.exceptions.RequestException as e:
+        raise ReferenceNotFoundError(e.args)
+
+
+## DOI support
+
 def doi2bibtex(doi):
     """Return a bibtex string of metadata from a DOI"""
 
     url = 'http://dx.doi.org/{}'.format(doi)
     headers = {'accept': 'application/x-bibtex'}
-    r = requests.get(url, headers=headers)
+    r = _get_request(url, headers=headers)
     if r.encoding is None:
         r.encoding = 'utf8'  # Do not rely on guessing from request
 
     return r.text
 
 
+## ISBN support
+
 def isbn2bibtex(isbn):
     """Return a bibtex string of metadata from an ISBN"""
 
     url = 'http://www.ottobib.com/isbn/{}/bibtex'.format(isbn)
-    r = requests.get(url)
+    r = _get_request(url)
     soup = BeautifulSoup(r.text, "html.parser")
     citation = soup.find("textarea").text
 
     return citation
 
+# Note: apparently ottobib.com uses character modifiers for accents instead
+# of the correct unicode characters. TODO: Should we convert them?
+
+
+## arXiv support
 
-def arxiv2bibtex(arxiv_id):
-    """Return a bibtex string of metadata from an arXiv ID"""
+_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
+           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
 
-    url = 'https://export.arxiv.org/api/query?id_list=' + arxiv_id
-    r = requests.get(url)
+def _is_arxiv_oldstyle(arxiv_id):
+    return re.match(r"(arXiv\:)?[a-z\-]+\/[0-9]+(v[0-9]+)?", arxiv_id) is not None
+
+def _extract_arxiv_id(entry):
+    pattern = r"http[s]?://arxiv.org/abs/(?P<entry_id>.+)"
+    return re.search(pattern, entry['id']).groupdict()['entry_id']
+
+
+def arxiv2bibtex(arxiv_id, try_doi=True, ui=None):
+    """Return a bibtex string of metadata from an arXiv ID
+
+    :param arxiv_id: arXiv id, with or without the `arXiv:` prefix and version
+                     suffix (e.g. `v1`). Old and new styles are accepted. Here
+                     are examples of accepted identifiers: `1510.00322`,
+                     `arXiv:1510.00322`, `0901.0512`, `arXiv:0901.0512`,
+                     `hep-ph/9409201` or `arXiv:hep-ph/9409201`.
+                     Note that the `arXiv:` prefix will be automatically
+                     removed, and the version suffix automatically added if
+                     missing.
+    :param try_doi: if a DOI is referenced in the arXiv metadata, try to
+                    fetch its metadata instead. If that fails for any reason,
+                    fall back to the arXiv metadata, displaying a warning
+                    message if a UI is provided.
+    :param ui: if not None, will display a warning if the DOI request
+               fails.
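+
+    For example, each of the following calls should return a bibtex string
+    (network access to the arXiv API is required)::
+
+        arxiv2bibtex('1510.00322')                     # new-style id
+        arxiv2bibtex('hep-ph/9409201', try_doi=False)  # old-style id, skip the DOI lookup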
+    """
+    ## query the arXiv API and handle errors
+    url = 'https://export.arxiv.org/api/query?id_list={}'.format(arxiv_id)
+    try:
+        r = requests.get(url)
+        if r.status_code == 400:  # bad request
+            msg = ("the arXiv server returned a bad request error. The "
+                   "arXiv id {} is possibly invalid or malformed.".format(arxiv_id))
+            raise ReferenceNotFoundError(msg)
+        r.raise_for_status()  # raise an exception for other HTTP
+                              # errors (e.g. 401, 404).
+    except requests.exceptions.RequestException as e:
+        msg = ("connection error while retrieving arXiv data for "
+               "'{}': {}".format(arxiv_id, e))
+        raise ReferenceNotFoundError(msg)
+
     feed = feedparser.parse(r.text)
-    entry = feed.entries[0]
+    if len(feed.entries) == 0:  # no results.
+        msg = "no results for arXiv id {}".format(arxiv_id)
+        raise ReferenceNotFoundError(msg)
+    if len(feed.entries) > 1:  # I don't know how that could happen, but let's
+                               # be ready for it.
+        results = '\n'.join('{}. {}'.format(i, entry['title'])
+                            for i, entry in enumerate(feed.entries))
+        msg = ("multiple results for arXiv id {}:\n{}\nThis is unexpected. "
+               "Please submit an issue at "
+               "https://github.com/pubs/pubs/issues").format(arxiv_id, results)
+        raise ReferenceNotFoundError(msg)
 
-    if 'title' not in entry:
-        raise ReferenceNotFoundException('arXiv ID not found.')
-    elif 'arxiv_doi' in entry:
-        bibtex = doi2bibtex(entry['arxiv_doi'])
-    else:
-        # Create a bibentry from the metadata.
-        db = BibDatabase()
-        author_str = ' and '.join(
-            [author['name'] for author in entry['authors']])
-        db.entries = [{
-            'ENTRYTYPE': 'article',
-            'ID': arxiv_id,
-            'author': author_str,
-            'title': entry['title'],
-            'year': str(entry['published_parsed'].tm_year),
-            'Eprint': arxiv_id,
-        }]
-        bibtex = bibtexparser.dumps(db)
+    entry = feed.entries[0]
+    if 'arxiv.org/api/errors' in entry['id']:  # the server returned an error message.
+        msg = 'the arXiv server returned an error message: {}'.format(entry['summary'])
+        raise ReferenceNotFoundError(msg)
+
+    ## try to return a doi instead of the arXiv reference
+    if try_doi and 'arxiv_doi' in entry:
+        try:
+            return doi2bibtex(entry['arxiv_doi'])
+        except ReferenceNotFoundError as e:
+            if ui is not None:
+                ui.warning(str(e))
+
+    ## create a bibentry from the arXiv response.
+    db = BibDatabase()
+    entry_id = _extract_arxiv_id(entry)
+    author_str = ' and '.join(
+        [author['name'] for author in entry['authors']])
+    db.entries = [{
+        'ENTRYTYPE': 'article',
+        'ID': entry_id,
+        'author': author_str,
+        'title': entry['title'],
+        'year': str(entry['published_parsed'].tm_year),
+        'month': _months[entry['published_parsed'].tm_mon-1],
+        'eprint': entry_id,
+        'eprinttype': 'arxiv',
+        'date': entry['published'],  # not really standard, but a resolution more
+                                     # granular than months is increasingly relevant.
+        'url': entry['link'],
+        'urldate': datetime.datetime.utcnow().isoformat(timespec='seconds') + 'Z'  # can't hurt.
+    }]
+    # we don't add eprintclass for old-style ids, as it is in the id already.
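+    # (e.g. 'hep-ph/9409201' already carries the 'hep-ph' class, while for a
+    # new-style id such as '1510.00322' it is only available through the
+    # arxiv_primary_category field of the API response)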
+    if not _is_arxiv_oldstyle(entry_id):
+        db.entries[0]['eprintclass'] = entry['arxiv_primary_category']['term']
+    if 'arxiv_doi' in entry:
+        db.entries[0]['arxiv_doi'] = entry['arxiv_doi']
+
+    bibtex = bibtexparser.dumps(db)
     return bibtex
+
+
+if __name__ == '__main__':
+    print(arxiv2bibtex("0704.0010"))
+    print(arxiv2bibtex("0704.010*"))
+# print(arxiv2bibtex("quant-ph/0703266"))
diff --git a/setup.py b/setup.py
index 9e99694..e1fd8a9 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ setup(
     },
     install_requires=['pyyaml', 'bibtexparser>=1.0', 'python-dateutil',
-                      'requests', 'configobj', 'beautifulsoup4'],
+                      'requests', 'configobj', 'beautifulsoup4', 'feedparser'],
     tests_require=['pyfakefs>=2.7', 'mock'],
     extras_require={'autocompletion': ['argcomplete'],
                     },
diff --git a/tests/test_apis.py b/tests/test_apis.py
index c2893cc..2df5289 100644
--- a/tests/test_apis.py
+++ b/tests/test_apis.py
@@ -7,7 +7,8 @@ import dotdot
 from pubs.p3 import ustr
 from pubs.endecoder import EnDecoder
-from pubs.apis import arxiv2bibtex, doi2bibtex, isbn2bibtex
+from pubs.apis import arxiv2bibtex, doi2bibtex, isbn2bibtex, _is_arxiv_oldstyle, _extract_arxiv_id
+
 
 class TestDOI2Bibtex(unittest.TestCase):
 
@@ -84,6 +85,23 @@ class TestArxiv2Bibtex(unittest.TestCase):
             entry['title'],
             'The entropy formula for the Ricci flow and its geometric applications')
 
-
-# Note: apparently ottobib.com uses caracter modifiers for accents instead
-# of the correct unicode characters. TODO: Should we convert them?
+    def test_oldstyle_pattern(self):
+        """Test that we can accurately differentiate between old and new style arXiv ids."""
+        # old-style arXiv ids
+        for arxiv_id in ['cs/9301113', 'math/9201277v3', 'astro-ph/9812133',
+                         'cond-mat/0604612', 'hep-ph/0702007v10', 'arXiv:physics/9403001'
+                        ]:
+            self.assertTrue(_is_arxiv_oldstyle(arxiv_id))
+        # new-style arXiv ids
+        for arxiv_id in ['1808.00954', 'arXiv:1808.00953', '1808.0953',
+                         '1808.00954v1', 'arXiv:1808.00953v2', '1808.0953v42']:
+            self.assertFalse(_is_arxiv_oldstyle(arxiv_id))
+
+    def test_extract_id(self):
+        """Test that ids are correctly extracted."""
+        self.assertEqual(_extract_arxiv_id({'id': "http://arxiv.org/abs/0704.0010v1"}), "0704.0010v1")
+        self.assertEqual(_extract_arxiv_id({'id': "https://arxiv.org/abs/0704.0010v1"}), "0704.0010v1")
+        self.assertEqual(_extract_arxiv_id({'id': "https://arxiv.org/abs/astro-ph/9812133v2"}), "astro-ph/9812133v2")
+
+if __name__ == '__main__':
+    unittest.main(verbosity=2)