From 81f266159367258ec02d23990be618a1671086b4 Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Thu, 13 Jul 2017 21:08:06 -0400 Subject: [PATCH 1/6] Cosmit --- pubs/content.py | 10 ++++++---- pubs/filebroker.py | 2 +- pubs/p3.py | 1 + pubs/repo.py | 15 +++++++++------ 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pubs/content.py b/pubs/content.py index 796302b..ee5f2af 100644 --- a/pubs/content.py +++ b/pubs/content.py @@ -52,16 +52,17 @@ def _open(path, mode): else: return open(system_path(path), mode, encoding='utf-8') + def check_file(path, fail=True): syspath = system_path(path) - return (_check_system_path_exists(syspath, fail=fail) - and _check_system_path_is(u'isfile', syspath, fail=fail)) + return (_check_system_path_exists(syspath, fail=fail) and + _check_system_path_is(u'isfile', syspath, fail=fail)) def check_directory(path, fail=True): syspath = system_path(path) - return (_check_system_path_exists(syspath, fail=fail) - and _check_system_path_is(u'isdir', syspath, fail=fail)) + return (_check_system_path_exists(syspath, fail=fail) and + _check_system_path_is(u'isdir', syspath, fail=fail)) def read_text_file(filepath, fail=True): @@ -79,6 +80,7 @@ def read_text_file(filepath, fail=True): return content + def read_binary_file(filepath, fail=True): check_file(filepath, fail=fail) with _open(filepath, 'rb') as f: diff --git a/pubs/filebroker.py b/pubs/filebroker.py index 47a8bd8..c20ceea 100644 --- a/pubs/filebroker.py +++ b/pubs/filebroker.py @@ -13,7 +13,7 @@ def filter_filename(filename, ext): """ Return the filename without the extension if the extension matches ext. Otherwise return None """ - pattern ='.*\{}$'.format(ext) + pattern = '.*\{}$'.format(ext) if re.match(pattern, filename) is not None: return filename[:-len(ext)] diff --git a/pubs/p3.py b/pubs/p3.py index 93229d6..9c9082c 100644 --- a/pubs/p3.py +++ b/pubs/p3.py @@ -39,6 +39,7 @@ else: # for test_usecase. 
def _get_raw_stdout(): return sys.stdout.buffer + def _get_raw_stderr(): return sys.stderr.buffer diff --git a/pubs/repo.py b/pubs/repo.py index 32026d3..fd942b0 100644 --- a/pubs/repo.py +++ b/pubs/repo.py @@ -110,8 +110,9 @@ class Repository(object): self.databroker.remove_note(citekey, self.conf['main']['note_extension'], silent=True) except IOError: - pass # FIXME: if IOError is about being unable to - # remove the file, we need to issue an error. + # FIXME: if IOError is about being unable to + # remove the file, we need to issue an error. + pass self.citekeys.remove(citekey) self.databroker.remove(citekey) @@ -126,16 +127,18 @@ class Repository(object): p.docpath = None self.push_paper(p, overwrite=True, event=False) except IOError: - pass # FIXME: if IOError is about being unable to - # remove the file, we need to issue an error.I + # FIXME: if IOError is about being unable to + # remove the file, we need to issue an error.I + pass def pull_docpath(self, citekey): try: p = self.pull_paper(citekey) return self.databroker.real_docpath(p.docpath) except IOError: - pass # FIXME: if IOError is about being unable to - # remove the file, we need to issue an error.I + # FIXME: if IOError is about being unable to + # remove the file, we need to issue an error.I + pass def rename_paper(self, paper, new_citekey=None, old_citekey=None): if old_citekey is None: From cf596206b0781bba82e5a76e8cc998cadd4cedf3 Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Thu, 13 Jul 2017 21:13:02 -0400 Subject: [PATCH 2/6] FIX: Encode unicode before writing to file in python2. This is necessary because _open returns a file descriptor in binary mode for python2. --- pubs/content.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pubs/content.py b/pubs/content.py index ee5f2af..c529b77 100644 --- a/pubs/content.py +++ b/pubs/content.py @@ -94,7 +94,16 @@ def remove_file(filepath): def write_file(filepath, data, mode='w'): + """Write data to file. 
+ + Data should be unicode except when binary mode is selected, + in which case data is expected to be binary. + """ check_directory(os.path.dirname(filepath)) + if 'b' not in mode and sys.version_info < (3,): + # _open returns in binary mode for python2 + # Data must be encoded + data = data.encode('utf-8') with _open(filepath, mode) as f: f.write(data) From 7b19a9dcdd09d7d51a2c51ccc8e5b1d0c77cd4ce Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Thu, 13 Jul 2017 21:16:26 -0400 Subject: [PATCH 3/6] Forces utf8 when no encoding in header for DOI request. The default behavior of the requests library is to use the encoding guessed by chardet, which is not always reliable, whereas doi.org seems to always return UTF-8 encoded data. It is unlikely that this will change without the header also being updated. --- pubs/apis.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pubs/apis.py b/pubs/apis.py index 8c95126..0a0e449 100644 --- a/pubs/apis.py +++ b/pubs/apis.py @@ -3,15 +3,19 @@ import requests from bs4 import BeautifulSoup + def doi2bibtex(doi): """Return a bibtex string of metadata from a DOI""" url = 'http://dx.doi.org/{}'.format(doi) headers = {'accept': 'application/x-bibtex'} r = requests.get(url, headers=headers) + if r.encoding is None: + r.encoding = 'utf8' # Do not rely on guessing from request return r.text + def isbn2bibtex(isbn): """Return a bibtex string of metadata from a DOI""" From 983d1892e07b8329ab65ab5543a56d2a959e7fac Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Thu, 13 Jul 2017 21:36:28 -0400 Subject: [PATCH 4/6] Removes unused imports. 
--- pubs/filebroker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pubs/filebroker.py b/pubs/filebroker.py index c20ceea..f834e8e 100644 --- a/pubs/filebroker.py +++ b/pubs/filebroker.py @@ -3,8 +3,7 @@ import re from .p3 import urlparse from .content import (check_file, check_directory, read_text_file, write_file, - system_path, check_content, content_type, get_content, - copy_content) + system_path, check_content, copy_content) from . import content From 38fc68adbc7fe63664fc65340e1339f296815688 Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Thu, 13 Jul 2017 21:37:35 -0400 Subject: [PATCH 5/6] Refactors filebroker to remove file duplication. --- pubs/filebroker.py | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/pubs/filebroker.py b/pubs/filebroker.py index f834e8e..b040313 100644 --- a/pubs/filebroker.py +++ b/pubs/filebroker.py @@ -8,6 +8,10 @@ from .content import (check_file, check_directory, read_text_file, write_file, from . import content +META_EXT = '.yaml' +BIB_EXT = '.bib' + + def filter_filename(filename, ext): """ Return the filename without the extension if the extension matches ext. 
Otherwise return None @@ -47,6 +51,12 @@ class FileBroker(object): if not check_directory(self.bibdir, fail=False): os.mkdir(system_path(self.bibdir)) + def bib_path(self, citekey): + return os.path.join(self.bibdir, citekey + BIB_EXT) + + def meta_path(self, citekey): + return os.path.join(self.metadir, citekey + META_EXT) + def pull_cachefile(self, filename): filepath = os.path.join(self.cachedir, filename) return content.read_binary_file(filepath) @@ -57,35 +67,31 @@ class FileBroker(object): def mtime_metafile(self, citekey): try: - filepath = os.path.join(self.metadir, citekey + '.yaml') + filepath = self.meta_path(citekey) return os.path.getmtime(filepath) except OSError: raise IOError("'{}' not found.".format(filepath)) def mtime_bibfile(self, citekey): try: - filepath = os.path.join(self.bibdir, citekey + '.bib') + filepath = self.bib_path(citekey) return os.path.getmtime(filepath) except OSError: raise IOError("'{}' not found.".format(filepath)) def pull_metafile(self, citekey): - filepath = os.path.join(self.metadir, citekey + '.yaml') - return read_text_file(filepath) + return read_text_file(self.meta_path(citekey)) def pull_bibfile(self, citekey): - filepath = os.path.join(self.bibdir, citekey + '.bib') - return read_text_file(filepath) + return read_text_file(self.bib_path(citekey)) def push_metafile(self, citekey, metadata): """Put content to disk. Will gladly override anything standing in its way.""" - filepath = os.path.join(self.metadir, citekey + '.yaml') - write_file(filepath, metadata) + write_file(self.meta_path(citekey), metadata) def push_bibfile(self, citekey, bibdata): """Put content to disk. Will gladly override anything standing in its way.""" - filepath = os.path.join(self.bibdir, citekey + '.bib') - write_file(filepath, bibdata) + write_file(self.bib_path(citekey), bibdata) def push(self, citekey, metadata, bibdata): """Put content to disk. 
Will gladly override anything standing in its way.""" @@ -93,10 +99,10 @@ class FileBroker(object): self.push_bibfile(citekey, bibdata) def remove(self, citekey): - metafilepath = os.path.join(self.metadir, citekey + '.yaml') + metafilepath = self.meta_path(citekey) if check_file(metafilepath): os.remove(system_path(metafilepath)) - bibfilepath = os.path.join(self.bibdir, citekey + '.bib') + bibfilepath = self.bib_path(citekey) if check_file(bibfilepath): os.remove(system_path(bibfilepath)) @@ -105,16 +111,16 @@ class FileBroker(object): :param meta_check: if True, will return if both the bibtex and the meta file exists. """ - does_exists = check_file(os.path.join(self.bibdir, citekey + '.bib'), fail=False) + does_exists = check_file(self.bib_path(citekey), fail=False) if meta_check: - meta_exists = check_file(os.path.join(self.metadir, citekey + '.yaml'), fail=False) + meta_exists = check_file(self.meta_path(citekey), fail=False) does_exists = does_exists and meta_exists return does_exists def listing(self, filestats=True): metafiles = [] for filename in os.listdir(system_path(self.metadir)): - citekey = filter_filename(filename, '.yaml') + citekey = filter_filename(filename, META_EXT) if citekey is not None: if filestats: stats = os.stat(system_path(os.path.join(self.metadir, filename))) @@ -124,7 +130,7 @@ class FileBroker(object): bibfiles = [] for filename in os.listdir(system_path(self.bibdir)): - citekey = filter_filename(filename, '.bib') + citekey = filter_filename(filename, BIB_EXT) if citekey is not None: if filestats: stats = os.stat(system_path(os.path.join(self.bibdir, filename))) From da6f07c6c9f8be94a6740b6207cf0d437fd1e04a Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Sat, 22 Jul 2017 15:42:12 -0400 Subject: [PATCH 6/6] Adds basic API test for DOI and ISBN. 
--- tests/test_apis.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 tests/test_apis.py diff --git a/tests/test_apis.py b/tests/test_apis.py new file mode 100644 index 0000000..087f32f --- /dev/null +++ b/tests/test_apis.py @@ -0,0 +1,64 @@ +# coding: utf8 + +from __future__ import unicode_literals +import unittest + +import dotdot + +from pubs.p3 import ustr +from pubs.endecoder import EnDecoder +from pubs.apis import doi2bibtex, isbn2bibtex + + +class TestDOI2Bibtex(unittest.TestCase): + + def setUp(self): + self.endecoder = EnDecoder() + + def test_unicode(self): + bib = doi2bibtex('10.1007/BF01700692') + self.assertIsInstance(bib, ustr) + self.assertIn('Kurt Gödel', bib) + + def test_parses_to_bibtex(self): + bib = doi2bibtex('10.1007/BF01700692') + b = self.endecoder.decode_bibdata(bib) + self.assertEqual(len(b), 1) + entry = b[list(b)[0]] + self.assertEqual(entry['author'][0], 'Gödel, Kurt') + self.assertEqual(entry['title'], + 'Über formal unentscheidbare Sätze der Principia ' + 'Mathematica und verwandter Systeme I') + + def test_parse_fails_on_incorrect_DOI(self): + bib = doi2bibtex('999999') + with self.assertRaises(ValueError): + self.endecoder.decode_bibdata(bib) + + +class TestISBN2Bibtex(unittest.TestCase): + + def setUp(self): + self.endecoder = EnDecoder() + + def test_unicode(self): + bib = isbn2bibtex('9782081336742') + self.assertIsInstance(bib, ustr) + self.assertIn('Poincaré, Henri', bib) + + def test_parses_to_bibtex(self): + bib = isbn2bibtex('9782081336742') + b = self.endecoder.decode_bibdata(bib) + self.assertEqual(len(b), 1) + entry = b[list(b)[0]] + self.assertEqual(entry['author'][0], 'Poincaré, Henri') + self.assertEqual(entry['title'], 'La science et l\'hypothèse') + + def test_parse_fails_on_incorrect_ISBN(self): + bib = doi2bibtex('9' * 13) + with self.assertRaises(ValueError): + self.endecoder.decode_bibdata(bib) + + +# Note: apparently ottobib.com uses character modifiers 
for accents instead +# of the correct unicode characters. TODO: Should we convert them?