commit
42d75d1230
dev_requirements.txt
@@ -0,0 +1,24 @@
# if you want to set up your environment for development of the pubs code,
# running `pip install -r dev_requirements.txt` is the only thing you have to do.
# Alternatively, and perhaps more conveniently, running `python setup.py test`
# will do the same *and* run the tests, but without installing the packages on
# the system.
# Note that if you introduce a new dependency, you need to add it here and, more
# importantly, to the setup.py script so that it is taken into account when
# installing from PyPI.

-e .
pyyaml
bibtexparser>=1.0
python-dateutil
requests
configobj
beautifulsoup4
feedparser
six

# those are the additional packages required to run the tests
pyfakefs
ddt
mock
pytest  # optional (`python setup.py test` works without it), but possible nonetheless

pubs/apis.py
@@ -1,27 +1,198 @@
"""Interface for Remote Bibliographic APIs"""

import re
import datetime

import requests
import bibtexparser
from bibtexparser.bibdatabase import BibDatabase
import feedparser
from bs4 import BeautifulSoup

from . import endecoder  # used by get_bibentry_from_api to validate results


class ReferenceNotFoundError(Exception):
    pass


def get_bibentry_from_api(id_str, id_type, try_doi=True, ui=None):
    """Return a bibtex string from various ID methods.

    This is a wrapper around functions that will return a bibtex string given
    one of:

    * DOI
    * ISBN
    * arXiv ID

    Args:
        id_str: A string with the ID.
        id_type: Name of the ID type. Must be one of `doi`, `isbn`, or `arxiv`.
        try_doi: Passed to the ID function; if a DOI is referenced in the
            metadata, try to use it instead (see `arxiv2bibtex`).
        ui: A UI object.

    Returns:
        A bibtex string.

    Raises:
        ValueError: if `id_type` is not one of `doi`, `isbn`, or `arxiv`.
        apis.ReferenceNotFoundError: if no valid reference could be found.
    """
    id_fns = {
        'doi': doi2bibtex,
        'isbn': isbn2bibtex,
        'arxiv': arxiv2bibtex,
    }

    if id_type not in id_fns:
        raise ValueError('id_type must be one of `doi`, `isbn`, or `arxiv`.')

    bibentry_raw = id_fns[id_type](id_str, try_doi=try_doi, ui=ui)
    bibentry = endecoder.EnDecoder().decode_bibdata(bibentry_raw)
    if bibentry is None:
        raise ReferenceNotFoundError(
            'invalid {} {} or unable to retrieve bibfile from it.'.format(id_type, id_str))
    return bibentry


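# A hedged usage sketch (illustrative only, not part of the module's API):
# the DOI below is the standard placeholder example from the DOI handbook,
# not a real reference, and the call relies on the default `try_doi=True`.
def _example_get_bibentry():
    return get_bibentry_from_api('10.1000/xyz123', 'doi', ui=None)

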
def _get_request(url, headers=None):
    """GET requests to a url. Return the `requests` object.

    :raise ReferenceNotFoundError: if anything goes bad (connection refused,
        timeout, HTTP status error (401, 404, etc.)).
    """
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        return r
    except requests.exceptions.RequestException as e:
        raise ReferenceNotFoundError(e.args)


## DOI support

def doi2bibtex(doi, **kwargs):
    """Return a bibtex string of metadata from a DOI"""

    url = 'https://dx.doi.org/{}'.format(doi)
    headers = {'accept': 'application/x-bibtex'}
    r = _get_request(url, headers=headers)
    if r.encoding is None:
        r.encoding = 'utf8'  # Do not rely on guessing from request

    return r.text


## ISBN support

def isbn2bibtex(isbn, **kwargs):
    """Return a bibtex string of metadata from an ISBN"""

    url = 'https://www.ottobib.com/isbn/{}/bibtex'.format(isbn)
    r = _get_request(url)
    soup = BeautifulSoup(r.text, "html.parser")
    citation = soup.find("textarea").text

    return citation

# Note: apparently ottobib.com uses character modifiers for accents instead
# of the correct (precomposed) unicode characters. TODO: Should we convert them?
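
# One hedged way to address the TODO above: unicode NFC normalization composes
# combining accent marks into the equivalent precomposed characters. This
# helper is hypothetical, for illustration only.
def _normalized_citation(isbn):
    import unicodedata
    return unicodedata.normalize('NFC', isbn2bibtex(isbn))

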
## arXiv support

_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']


def _is_arxiv_oldstyle(arxiv_id):
    return re.match(r"(arXiv\:)?[a-z\-]+\/[0-9]+(v[0-9]+)?", arxiv_id) is not None


def _extract_arxiv_id(entry):
    pattern = r"http[s]?://arxiv\.org/abs/(?P<entry_id>.+)"
    return re.search(pattern, entry['id']).groupdict()['entry_id']


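# Hedged illustration (not part of the commit's API), using ids taken from the
# docstring below: old-style ids have a `subject/number` form, while new-style
# ids are purely numeric, so only the former match `_is_arxiv_oldstyle`.
def _example_id_styles():
    assert _is_arxiv_oldstyle('hep-ph/9409201')
    assert not _is_arxiv_oldstyle('1510.00322')

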
def arxiv2bibtex(arxiv_id, try_doi=True, ui=None):
    """Return a bibtex string of metadata from an arXiv ID

    :param arxiv_id: arXiv id, with or without the `arXiv:` prefix and version
                     suffix (e.g. `v1`). Old and new styles are accepted. Here
                     are examples of accepted identifiers: `1510.00322`,
                     `arXiv:1510.00322`, `0901.0512`, `arXiv:0901.0512`,
                     `hep-ph/9409201` or `arXiv:hep-ph/9409201`.
                     Note that the `arXiv:` prefix will be automatically
                     removed, and the version suffix automatically added if
                     missing.
    :param try_doi:  if a DOI is referenced in the arXiv metadata, try to
                     download its bibtex instead. If that fails for any reason,
                     falls back to the arXiv data, with a warning message, if
                     the UI is provided.
    :param ui:       if not None, will display a warning if the doi request
                     fails.
    """
    ## handle errors
    url = 'https://export.arxiv.org/api/query?id_list={}'.format(arxiv_id)
    try:
        r = requests.get(url)
        if r.status_code == 400:  # bad request
            msg = ("the arXiv server returned a bad request error. The "
                   "arXiv id {} is possibly invalid or malformed.".format(arxiv_id))
            raise ReferenceNotFoundError(msg)
        r.raise_for_status()  # raise an exception for other HTTP errors
                              # (401, 404, etc.)
    except requests.exceptions.RequestException as e:
        msg = ("connection error while retrieving arXiv data for "
               "'{}': {}".format(arxiv_id, e))
        raise ReferenceNotFoundError(msg)

    feed = feedparser.parse(r.text)
    if len(feed.entries) == 0:  # no results.
        msg = "no results for arXiv id {}".format(arxiv_id)
        raise ReferenceNotFoundError(msg)
    if len(feed.entries) > 1:  # I don't know how that could happen, but let's
                               # be ready for it.
        results = '\n'.join('{}. {}'.format(i, entry['title'])
                            for i, entry in enumerate(feed.entries))
        msg = ("multiple results for arXiv id {}:\n{}\nThis is unexpected. "
               "Please submit an issue at "
               "https://github.com/pubs/pubs/issues").format(arxiv_id, results)
        raise ReferenceNotFoundError(msg)

    entry = feed.entries[0]

    ## try to return a doi instead of the arXiv reference
    if try_doi and 'arxiv_doi' in entry:
        try:
            return doi2bibtex(entry['arxiv_doi'])
        except ReferenceNotFoundError as e:
            if ui is not None:
                ui.warning(str(e))

    ## create a bibentry from the arXiv response.
    db = BibDatabase()
    entry_id = _extract_arxiv_id(entry)
    author_str = ' and '.join(
        [author['name'] for author in entry['authors']])
    db.entries = [{
        'ENTRYTYPE': 'article',
        'ID': entry_id,
        'author': author_str,
        'title': entry['title'],
        'year': str(entry['published_parsed'].tm_year),
        'month': _months[entry['published_parsed'].tm_mon - 1],
        'eprint': entry_id,
        'eprinttype': 'arxiv',
        'date': entry['published'],  # not really standard, but a resolution
                                     # more granular than months is
                                     # increasingly relevant.
        'url': entry['link'],
        'urldate': datetime.datetime.utcnow().isoformat() + 'Z'  # can't hurt.
    }]
    # we don't add eprintclass for old-style ids, as it is in the id already.
    if not _is_arxiv_oldstyle(entry_id):
        db.entries[0]['eprintclass'] = entry['arxiv_primary_category']['term']
    if 'arxiv_doi' in entry:
        db.entries[0]['doi'] = entry['arxiv_doi']

    bibtex = bibtexparser.dumps(db)
    return bibtex
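

# A hedged usage sketch: the id comes from the docstring examples above, and
# `try_doi=False` forces the arXiv metadata path even when a DOI is available.
def _example_arxiv2bibtex():
    return arxiv2bibtex('arXiv:hep-ph/9409201', try_doi=False)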

pubs/commands/statistics_cmd.py
@@ -0,0 +1,33 @@
from ..repo import Repository
from ..uis import get_ui
from .. import color


def parser(subparsers, conf):
    parser = subparsers.add_parser(
        'statistics',
        help="show statistics on the repository.")
    return parser


def command(conf, args):
    ui = get_ui()
    rp = Repository(conf)
    papers = list(rp.all_papers())

    paper_count = len(papers)
    doc_count = sum([0 if p.docpath is None else 1 for p in papers])
    tag_count = len(list(rp.get_tags()))
    papers_with_tags = sum([1 if p.tags else 0 for p in papers])

    ui.message(color.dye_out('Repository statistics:', 'bold'))
    ui.message('Total papers: {}, {} ({}) have a document attached'.format(
        color.dye_out('{:d}'.format(paper_count), 'bgreen'),
        color.dye_out('{:d}'.format(doc_count), 'bold'),
        '{:.0f}%'.format(100. * doc_count / paper_count),
    ))
    ui.message('Total tags: {}, {} ({}) of papers have at least one tag'.format(
        color.dye_out('{:d}'.format(tag_count), 'bgreen'),
        color.dye_out('{:d}'.format(papers_with_tags), 'bold'),
        '{:.0f}%'.format(100. * papers_with_tags / paper_count),
    ))
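
# Hedged sketch of the output shape, derived from the format strings above
# (the counts are illustrative, not real data):
#
#   Repository statistics:
#   Total papers: 42, 30 (71%) have a document attached
#   Total tags: 12, 35 (83%) of papers have at least one tag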

requirements.txt
@@ -1,6 +0,0 @@
pyyaml
bibtexparser>=1.0
python-dateutil
requests
configobj
beautifulsoup4

tests/mock_requests.py
@@ -0,0 +1,106 @@
"""
Mock the `requests.get` function, and handle collecting data to do so.

Three modes are available, and controlled via the `PUBS_TESTS_MODE` environment
variable. To modify the variable, under linux or macos, do one of:
$ export PUBS_TESTS_MODE=MOCK
$ export PUBS_TESTS_MODE=COLLECT
$ export PUBS_TESTS_MODE=ONLINE

The MOCK mode is the default one, active even if PUBS_TESTS_MODE has not been
set. It uses saved data to run pubs unit tests relying on the `requests.get`
function without the need of an internet connection (it is also much faster).
The prefetched data is saved in the `test_apis_data.json` file.

The COLLECT mode does real GET requests, and updates the `test_apis_data.json`
file. It is needed if you add or modify the tests relying on `requests.get`.

The ONLINE mode bypasses all this and uses the original `requests.get` without
accessing or updating the `test_apis_data.json` data. It might be useful when
running tests on Travis for instance.
"""


import os
import json

import requests


_original_requests_get = requests.get
_collected_responses = []
_data_filepath = os.path.join(os.path.dirname(__file__), 'test_apis_data.json')


class MockingResponse:
    def __init__(self, text, status_code=200, error_msg=None):
        self.text = text
        self.status_code = status_code
        self.error_msg = error_msg
        self.encoding = 'utf8'

    def raise_for_status(self):
        if self.status_code != 200:
            raise requests.exceptions.RequestException(self.error_msg)


def intercept_text(text):
    try:
        if '10.1103/PhysRevD.89.084044' in text:
            # replace with wrong DOI
            text = text.replace('PhysRevD', 'INVALIDDOI')
    except TypeError:  # `text` is bytes, not str
        if b'10.1103/PhysRevD.89.084044' in text:
            # replace with wrong DOI
            text = text.replace(b'PhysRevD', b'INVALIDDOI')

    return text


mode = os.environ.get('PUBS_TESTS_MODE', 'MOCK')

if mode == 'MOCK':

    with open(_data_filepath, 'r') as fd:
        _collected_responses = json.load(fd)

    def mock_requests_get(*args, **kwargs):
        for args2, kwargs2, text, status_code, error_msg in _collected_responses:
            if list(args) == list(args2) and kwargs == kwargs2:
                return MockingResponse(text, status_code, error_msg)
        raise KeyError(('No stub data found for requests.get({}, {}).\n You may'
                        ' need to update the mock data. Look at the '
                        'tests/mock_requests.py file for an explanation').format(args, kwargs))

elif mode == 'COLLECT':

    def _save_collected_responses():
        with open(_data_filepath, 'w') as fd:
            json.dump(sorted(_collected_responses), fd, indent=2)

    def mock_requests_get(*args, **kwargs):
        text, status_code, error_msg = None, None, None
        try:
            r = _original_requests_get(*args, **kwargs)
            text, status_code = r.text, r.status_code
            r.raise_for_status()
        except requests.exceptions.RequestException as e:
            error_msg = str(e)

        text = intercept_text(text)
        _collected_responses.append((args, kwargs, text, status_code, error_msg))
        _save_collected_responses()  # yes, we save every time, because it's not
                                     # clear how to run once after all the tests
                                     # have run. If you figure it out...

        return MockingResponse(text, status_code, error_msg)

elif mode == 'ONLINE':

    def mock_requests_get(*args, **kwargs):
        r = _original_requests_get(*args, **kwargs)
        r._content = intercept_text(r.content)
        return r
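

# A hedged sketch of how a test might activate this mock; the patch target
# 'pubs.apis.requests.get' is an assumption about where `requests.get` is
# looked up, and the test name is hypothetical.
#
#     import mock
#     from tests import mock_requests
#
#     @mock.patch('pubs.apis.requests.get',
#                 side_effect=mock_requests.mock_requests_get)
#     def test_doi2bibtex(self, reqget):
#         ...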

tests/requirements.txt
@@ -1,5 +0,0 @@
# those are the additional packages required to run the tests
six
pyfakefs
ddt
mock
File diff suppressed because one or more lines are too long