@@ -1,4 +1,6 @@
 """Interface for Remote Bibliographic APIs"""
+import re
+import datetime
 
 import requests
 import bibtexparser
@@ -7,7 +9,7 @@ import feedparser
 from bs4 import BeautifulSoup
 
 
-class ReferenceNotFoundException(Exception):
+class ReferenceNotFoundError(Exception):
     pass
 
 
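Renaming `ReferenceNotFoundException` to `ReferenceNotFoundError` is a breaking change for any call site that catches the old name. A minimal call-site sketch of the new error handling (assumptions: the module is importable as `pubs.apis`, the `add_from_doi` wrapper is hypothetical, and `ui` is any object exposing the `warning` method used later in this diff):

    from pubs import apis

    def add_from_doi(doi, ui=None):
        # ReferenceNotFoundError now also covers connection and HTTP failures,
        # since _get_request (added below) converts RequestException into it.
        try:
            return apis.doi2bibtex(doi)
        except apis.ReferenceNotFoundError as exc:
            if ui is not None:
                ui.warning(str(exc))
            return None
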
@@ -50,53 +52,156 @@ def get_bibentry_from_api(id_str, id_type, rp):
     return bibentry
 
 
+def _get_request(url, headers=None):
+    """Send a GET request to a url. Return the `requests` response object.
+
+    :raise ReferenceNotFoundError: if anything goes wrong (connection refused,
+                                   timeout, HTTP status error (401, 404, etc.)).
+    """
+    try:
+        r = requests.get(url, headers=headers)
+        r.raise_for_status()
+        return r
+    except requests.exceptions.RequestException as e:
+        raise ReferenceNotFoundError(e.args)
+
+
 ## DOI support
 
 def doi2bibtex(doi):
     """Return a bibtex string of metadata from a DOI"""
 
     url = 'http://dx.doi.org/{}'.format(doi)
     headers = {'accept': 'application/x-bibtex'}
-    r = requests.get(url, headers=headers)
+    r = _get_request(url, headers=headers)
     if r.encoding is None:
         r.encoding = 'utf8'  # Do not rely on guessing from request
 
     return r.text
 
 
 ## ISBN support
 
 def isbn2bibtex(isbn):
     """Return a bibtex string of metadata from an ISBN"""
 
     url = 'http://www.ottobib.com/isbn/{}/bibtex'.format(isbn)
-    r = requests.get(url)
+    r = _get_request(url)
     soup = BeautifulSoup(r.text, "html.parser")
     citation = soup.find("textarea").text
 
     return citation
 
+# Note: apparently ottobib.com uses character modifiers for accents instead
+# of the correct unicode characters. TODO: Should we convert them?
+
 
 ## arXiv support
 
-def arxiv2bibtex(arxiv_id):
-    """Return a bibtex string of metadata from an arXiv ID"""
-
-    url = 'https://export.arxiv.org/api/query?id_list=' + arxiv_id
-    r = requests.get(url)
-
-    feed = feedparser.parse(r.text)
-    entry = feed.entries[0]
-
-    if 'title' not in entry:
-        raise ReferenceNotFoundException('arXiv ID not found.')
-    elif 'arxiv_doi' in entry:
-        bibtex = doi2bibtex(entry['arxiv_doi'])
-    else:
-        # Create a bibentry from the metadata.
-        db = BibDatabase()
-        author_str = ' and '.join(
-            [author['name'] for author in entry['authors']])
-        db.entries = [{
-            'ENTRYTYPE': 'article',
-            'ID': arxiv_id,
-            'author': author_str,
-            'title': entry['title'],
-            'year': str(entry['published_parsed'].tm_year),
-            'Eprint': arxiv_id,
-        }]
-        bibtex = bibtexparser.dumps(db)
-    return bibtex
+_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
+           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
+
+
+def _is_arxiv_oldstyle(arxiv_id):
+    return re.match(r"(arXiv\:)?[a-z\-]+\/[0-9]+(v[0-9]+)?", arxiv_id) is not None
+
+
+def _extract_arxiv_id(entry):
+    pattern = r"http[s]?://arxiv.org/abs/(?P<entry_id>.+)"
+    return re.search(pattern, entry['id']).groupdict()['entry_id']
+
+
+def arxiv2bibtex(arxiv_id, try_doi=True, ui=None):
+    """Return a bibtex string of metadata from an arXiv ID
+
+    :param arxiv_id: arXiv id, with or without the `arXiv:` prefix and version
+                     suffix (e.g. `v1`). Old and new styles are accepted. Here
+                     are examples of accepted identifiers: `1510.00322`,
+                     `arXiv:1510.00322`, `0901.0512`, `arXiv:0901.0512`,
+                     `hep-ph/9409201` or `arXiv:hep-ph/9409201`.
+                     Note that the `arXiv:` prefix will be automatically
+                     removed, and the version suffix automatically added if
+                     missing.
+    :param try_doi:  if a DOI is referenced in the arXiv metadata, try to
+                     download its metadata instead. If that fails for any
+                     reason, falls back to the arXiv metadata, with a warning
+                     message if a UI is provided.
+    :param ui:       if not None, will display a warning if the DOI request
+                     fails.
+    """
+    ## handle errors
+    url = 'https://export.arxiv.org/api/query?id_list={}'.format(arxiv_id)
+    try:
+        r = requests.get(url)
+        if r.status_code == 400:  # bad request
+            msg = ("the arXiv server returned a bad request error. The "
+                   "arXiv id {} is possibly invalid or malformed.".format(arxiv_id))
+            raise ReferenceNotFoundError(msg)
+        r.raise_for_status()  # raise an exception for other HTTP errors:
+                              # 401, 404, etc.
+    except requests.exceptions.RequestException as e:
+        msg = ("connection error while retrieving arXiv data for "
+               "'{}': {}".format(arxiv_id, e))
+        raise ReferenceNotFoundError(msg)
+
+    feed = feedparser.parse(r.text)
+    if len(feed.entries) == 0:  # no results.
+        msg = "no results for arXiv id {}".format(arxiv_id)
+        raise ReferenceNotFoundError(msg)
+    if len(feed.entries) > 1:  # I don't know how that could happen, but let's
+                               # be ready for it.
+        results = '\n'.join('{}. {}'.format(i, entry['title'])
+                            for i, entry in enumerate(feed.entries))
+        msg = ("multiple results for arXiv id {}:\n{}\nThis is unexpected. "
+               "Please submit an issue at "
+               "https://github.com/pubs/pubs/issues").format(arxiv_id, results)
+        raise ReferenceNotFoundError(msg)
+
+    entry = feed.entries[0]
+    if 'arxiv.org/api/errors' in entry['id']:  # server is returning an error message.
+        msg = 'the arXiv server returned an error message: {}'.format(entry['summary'])
+        raise ReferenceNotFoundError(msg)
+
+    ## try to return a doi instead of the arXiv reference
+    if try_doi and 'arxiv_doi' in entry:
+        try:
+            return doi2bibtex(entry['arxiv_doi'])
+        except ReferenceNotFoundError as e:
+            if ui is not None:
+                ui.warning(str(e))
+
+    ## create a bibentry from the arXiv response.
+    db = BibDatabase()
+    entry_id = _extract_arxiv_id(entry)
+    author_str = ' and '.join(
+        [author['name'] for author in entry['authors']])
+    db.entries = [{
+        'ENTRYTYPE': 'article',
+        'ID': entry_id,
+        'author': author_str,
+        'title': entry['title'],
+        'year': str(entry['published_parsed'].tm_year),
+        'month': _months[entry['published_parsed'].tm_mon - 1],
+        'eprint': entry_id,
+        'eprinttype': 'arxiv',
+        'date': entry['published'],  # not really standard, but a resolution more
+                                     # granular than months is increasingly relevant.
+        'url': entry['link'],
+        'urldate': datetime.datetime.utcnow().isoformat(timespec='seconds') + 'Z'  # can't hurt.
+    }]
+    # we don't add eprintclass for old-style ids, as it is in the id already.
+    if not _is_arxiv_oldstyle(entry_id):
+        db.entries[0]['eprintclass'] = entry['arxiv_primary_category']['term']
+    if 'arxiv_doi' in entry:
+        db.entries[0]['arxiv_doi'] = entry['arxiv_doi']
+
+    bibtex = bibtexparser.dumps(db)
+    return bibtex
+
+
+if __name__ == '__main__':
+    print(arxiv2bibtex("0704.0010"))
+    print(arxiv2bibtex("0704.010*"))
+    # print(arxiv2bibtex("quant-ph/0703266"))
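The two new helpers encode the old-style/new-style arXiv ID distinction and the extraction of the ID from the API's `entry['id']` URL. A quick sanity-check sketch against identifiers already listed in the `arxiv2bibtex` docstring (the `fake_entry` dict only mimics the relevant field of a feedparser entry):

    # Old-style identifiers (archive prefix + slash) match; new-style ones do not.
    assert _is_arxiv_oldstyle('hep-ph/9409201')
    assert _is_arxiv_oldstyle('arXiv:hep-ph/9409201')
    assert not _is_arxiv_oldstyle('1510.00322')

    # The entry id is recovered from the 'abs' URL returned by the arXiv API.
    fake_entry = {'id': 'http://arxiv.org/abs/1510.00322v1'}
    assert _extract_arxiv_id(fake_entry) == '1510.00322v1'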
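For an end-to-end check of the rewritten arXiv path, one option is to parse the returned string back with bibtexparser. A sketch only: it needs network access, and the exact field values depend on what the arXiv API returns:

    import bibtexparser

    bib = arxiv2bibtex('1510.00322', try_doi=False)  # force the arXiv entry, skip the DOI fallback
    entry = bibtexparser.loads(bib).entries[0]
    print(entry['ID'], entry['eprinttype'], entry.get('eprintclass'))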