From be253f9084fd4ca5f9fad0075cc15cdc6e78f4b1 Mon Sep 17 00:00:00 2001
From: "Fabien C. Y. Benureau"
Date: Sun, 5 Aug 2018 21:33:25 +0900
Subject: [PATCH] handling of arxiv errors

---
 pubs/apis.py       | 157 +++++++++++++++++++++++++++++++++++++--------
 setup.py           |   2 +-
 tests/test_apis.py |  26 ++++++--
 3 files changed, 154 insertions(+), 31 deletions(-)

diff --git a/pubs/apis.py b/pubs/apis.py
index 014217f..5c30bf6 100644
--- a/pubs/apis.py
+++ b/pubs/apis.py
@@ -1,4 +1,6 @@
 """Interface for Remote Bibliographic APIs"""
+import re
+import datetime
 
 import requests
 import bibtexparser
@@ -7,7 +9,7 @@ import feedparser
 from bs4 import BeautifulSoup
 
 
-class ReferenceNotFoundException(Exception):
+class ReferenceNotFoundError(Exception):
     pass
 
 
@@ -50,53 +52,156 @@ def get_bibentry_from_api(id_str, id_type, rp):
     return bibentry
 
 
+def _get_request(url, headers=None):
+    """Send a GET request to a url and return the `requests` response object.
+
+    :raise ReferenceNotFoundError: if anything goes wrong (connection refused,
+                                   timeout, HTTP status error (401, 404, etc.)).
+    """
+    try:
+        r = requests.get(url, headers=headers)
+        r.raise_for_status()
+        return r
+    except requests.exceptions.RequestException as e:
+        raise ReferenceNotFoundError(e.args)
+
+
+## DOI support
+
 def doi2bibtex(doi):
     """Return a bibtex string of metadata from a DOI"""
 
     url = 'http://dx.doi.org/{}'.format(doi)
     headers = {'accept': 'application/x-bibtex'}
-    r = requests.get(url, headers=headers)
+    r = _get_request(url, headers=headers)
     if r.encoding is None:
         r.encoding = 'utf8'  # Do not rely on guessing from request
 
     return r.text
 
 
+## ISBN support
+
 def isbn2bibtex(isbn):
     """Return a bibtex string of metadata from an ISBN"""
 
     url = 'http://www.ottobib.com/isbn/{}/bibtex'.format(isbn)
-    r = requests.get(url)
+    r = _get_request(url)
     soup = BeautifulSoup(r.text, "html.parser")
     citation = soup.find("textarea").text
 
     return citation
 
+# Note: apparently ottobib.com uses character modifiers for accents instead
+# of the correct unicode characters. TODO: Should we convert them?
+
+
+## arXiv support
 
-def arxiv2bibtex(arxiv_id):
-    """Return a bibtex string of metadata from an arXiv ID"""
+_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
+           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
 
-    url = 'https://export.arxiv.org/api/query?id_list=' + arxiv_id
-    r = requests.get(url)
+def _is_arxiv_oldstyle(arxiv_id):
+    return re.match(r"(arXiv\:)?[a-z\-]+\/[0-9]+(v[0-9]+)?", arxiv_id) is not None
+
+def _extract_arxiv_id(entry):
+    pattern = r"http[s]?://arxiv.org/abs/(?P<entry_id>.+)"
+    return re.search(pattern, entry['id']).groupdict()['entry_id']
+
+
+def arxiv2bibtex(arxiv_id, try_doi=True, ui=None):
+    """Return a bibtex string of metadata from an arXiv ID
+
+    :param arxiv_id: arXiv id, with or without the `arXiv:` prefix and version
+                     suffix (e.g. `v1`). Old and new styles are accepted. Here
+                     are examples of accepted identifiers: `1510.00322`,
+                     `arXiv:1510.00322`, `0901.0512`, `arXiv:0901.0512`,
+                     `hep-ph/9409201` or `arXiv:hep-ph/9409201`.
+                     Note that the `arXiv:` prefix will be automatically
+                     removed, and the version suffix automatically added if
+                     missing.
+    :param try_doi: if a DOI is referenced in the arXiv metadata, try to
+                    fetch its metadata instead. If that fails for any reason,
+                    fall back to the arXiv metadata, displaying a warning
+                    message if a UI is provided.
+    :param ui: if not None, will display a warning if the DOI request
+               fails.
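+
+    For example, each of the following calls should return a bibtex string
+    (network access to the arXiv API is required)::
+
+        arxiv2bibtex('1510.00322')                     # new-style id
+        arxiv2bibtex('hep-ph/9409201', try_doi=False)  # old-style id, skip the DOI lookup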
+    """
+    ## query the arXiv API and handle errors
+    url = 'https://export.arxiv.org/api/query?id_list={}'.format(arxiv_id)
+    try:
+        r = requests.get(url)
+        if r.status_code == 400:  # bad request
+            msg = ("the arXiv server returned a bad request error. The "
+                   "arXiv id {} is possibly invalid or malformed.".format(arxiv_id))
+            raise ReferenceNotFoundError(msg)
+        r.raise_for_status()  # raise an exception for other HTTP
+                              # errors (e.g. 401, 404).
+    except requests.exceptions.RequestException as e:
+        msg = ("connection error while retrieving arXiv data for "
+               "'{}': {}".format(arxiv_id, e))
+        raise ReferenceNotFoundError(msg)
+
     feed = feedparser.parse(r.text)
-    entry = feed.entries[0]
+    if len(feed.entries) == 0:  # no results.
+        msg = "no results for arXiv id {}".format(arxiv_id)
+        raise ReferenceNotFoundError(msg)
+    if len(feed.entries) > 1:  # I don't know how that could happen, but let's
+                               # be ready for it.
+        results = '\n'.join('{}. {}'.format(i, entry['title'])
+                            for i, entry in enumerate(feed.entries))
+        msg = ("multiple results for arXiv id {}:\n{}\nThis is unexpected. "
+               "Please submit an issue at "
+               "https://github.com/pubs/pubs/issues").format(arxiv_id, results)
+        raise ReferenceNotFoundError(msg)
 
-    if 'title' not in entry:
-        raise ReferenceNotFoundException('arXiv ID not found.')
-    elif 'arxiv_doi' in entry:
-        bibtex = doi2bibtex(entry['arxiv_doi'])
-    else:
-        # Create a bibentry from the metadata.
-        db = BibDatabase()
-        author_str = ' and '.join(
-            [author['name'] for author in entry['authors']])
-        db.entries = [{
-            'ENTRYTYPE': 'article',
-            'ID': arxiv_id,
-            'author': author_str,
-            'title': entry['title'],
-            'year': str(entry['published_parsed'].tm_year),
-            'Eprint': arxiv_id,
-        }]
-        bibtex = bibtexparser.dumps(db)
+    entry = feed.entries[0]
+    if 'arxiv.org/api/errors' in entry['id']:  # the server returned an error message.
+        msg = 'the arXiv server returned an error message: {}'.format(entry['summary'])
+        raise ReferenceNotFoundError(msg)
+
+    ## try to return a doi instead of the arXiv reference
+    if try_doi and 'arxiv_doi' in entry:
+        try:
+            return doi2bibtex(entry['arxiv_doi'])
+        except ReferenceNotFoundError as e:
+            if ui is not None:
+                ui.warning(str(e))
+
+    ## create a bibentry from the arXiv response.
+    db = BibDatabase()
+    entry_id = _extract_arxiv_id(entry)
+    author_str = ' and '.join(
+        [author['name'] for author in entry['authors']])
+    db.entries = [{
+        'ENTRYTYPE': 'article',
+        'ID': entry_id,
+        'author': author_str,
+        'title': entry['title'],
+        'year': str(entry['published_parsed'].tm_year),
+        'month': _months[entry['published_parsed'].tm_mon-1],
+        'eprint': entry_id,
+        'eprinttype': 'arxiv',
+        'date': entry['published'],  # not really standard, but a resolution more
+                                     # granular than months is increasingly relevant.
+        'url': entry['link'],
+        'urldate': datetime.datetime.utcnow().isoformat(timespec='seconds') + 'Z'  # can't hurt.
+    }]
+    # we don't add eprintclass for old-style ids, as it is in the id already.
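+    # (e.g. 'hep-ph/9409201' already carries the 'hep-ph' class, while for a
+    # new-style id such as '1510.00322' it is only available through the
+    # arxiv_primary_category field of the API response)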
+    if not _is_arxiv_oldstyle(entry_id):
+        db.entries[0]['eprintclass'] = entry['arxiv_primary_category']['term']
+    if 'arxiv_doi' in entry:
+        db.entries[0]['arxiv_doi'] = entry['arxiv_doi']
+
+    bibtex = bibtexparser.dumps(db)
     return bibtex
+
+
+if __name__ == '__main__':
+    print(arxiv2bibtex("0704.0010"))
+    print(arxiv2bibtex("0704.010*"))
+# print(arxiv2bibtex("quant-ph/0703266"))
diff --git a/setup.py b/setup.py
index 9e99694..e1fd8a9 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ setup(
     },
     install_requires=['pyyaml', 'bibtexparser>=1.0', 'python-dateutil',
-                      'requests', 'configobj', 'beautifulsoup4'],
+                      'requests', 'configobj', 'beautifulsoup4', 'feedparser'],
     tests_require=['pyfakefs>=2.7', 'mock'],
     extras_require={'autocompletion': ['argcomplete'],
                     },
diff --git a/tests/test_apis.py b/tests/test_apis.py
index c2893cc..2df5289 100644
--- a/tests/test_apis.py
+++ b/tests/test_apis.py
@@ -7,7 +7,8 @@ import dotdot
 from pubs.p3 import ustr
 from pubs.endecoder import EnDecoder
-from pubs.apis import arxiv2bibtex, doi2bibtex, isbn2bibtex
+from pubs.apis import arxiv2bibtex, doi2bibtex, isbn2bibtex, _is_arxiv_oldstyle, _extract_arxiv_id
+
 
 class TestDOI2Bibtex(unittest.TestCase):
 
@@ -84,6 +85,23 @@ class TestArxiv2Bibtex(unittest.TestCase):
             entry['title'],
             'The entropy formula for the Ricci flow and its geometric applications')
 
-
-# Note: apparently ottobib.com uses caracter modifiers for accents instead
-# of the correct unicode characters. TODO: Should we convert them?
+    def test_oldstyle_pattern(self):
+        """Test that we can accurately differentiate between old and new style arXiv ids."""
+        # old-style arXiv ids
+        for arxiv_id in ['cs/9301113', 'math/9201277v3', 'astro-ph/9812133',
+                         'cond-mat/0604612', 'hep-ph/0702007v10', 'arXiv:physics/9403001'
+                        ]:
+            self.assertTrue(_is_arxiv_oldstyle(arxiv_id))
+        # new-style arXiv ids
+        for arxiv_id in ['1808.00954', 'arXiv:1808.00953', '1808.0953',
+                         '1808.00954v1', 'arXiv:1808.00953v2', '1808.0953v42']:
+            self.assertFalse(_is_arxiv_oldstyle(arxiv_id))
+
+    def test_extract_id(self):
+        """Test that ids are correctly extracted."""
+        self.assertEqual(_extract_arxiv_id({'id': "http://arxiv.org/abs/0704.0010v1"}), "0704.0010v1")
+        self.assertEqual(_extract_arxiv_id({'id': "https://arxiv.org/abs/0704.0010v1"}), "0704.0010v1")
+        self.assertEqual(_extract_arxiv_id({'id': "https://arxiv.org/abs/astro-ph/9812133v2"}), "astro-ph/9812133v2")
+
+if __name__ == '__main__':
+    unittest.main(verbosity=2)