diff --git a/pubs/bibstruct.py b/pubs/bibstruct.py index 9b85728..fafa533 100644 --- a/pubs/bibstruct.py +++ b/pubs/bibstruct.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import unicodedata import re @@ -12,7 +14,7 @@ CITEKEY_EXCLUDE_RE = re.compile('[%s]' % re.escape(CONTROL_CHARS + CITEKEY_FORBIDDEN_CHARS)) def str2citekey(s): - key = unicodedata.normalize('NFKD', ustr(s)).encode('ascii', 'ignore') + key = unicodedata.normalize('NFKD', ustr(s)).encode('ascii', 'ignore').decode() key = CITEKEY_EXCLUDE_RE.sub('', key) # Normalize chars and remove non-ascii return key diff --git a/pubs/content.py b/pubs/content.py index f4b9c03..bc33c4f 100644 --- a/pubs/content.py +++ b/pubs/content.py @@ -1,4 +1,5 @@ import os +import io import subprocess import tempfile import shutil @@ -6,6 +7,16 @@ import shutil from .p3 import urlparse, HTTPConnection, urlopen +"""Conventions: + - all files are written using utf8 encoding by default, + - any function returning or variable containing byte data should + be prefixed by 'byte_' +""" + + +ENCODING = 'utf8' + + # files i/o def _check_system_path_exists(path, fail=True): @@ -42,14 +53,14 @@ def check_directory(path, fail=True): def read_file(filepath): check_file(filepath) - with open(system_path(filepath), 'r') as f: - s = f.read() - return s + with io.open(system_path(filepath), 'r', encoding=ENCODING) as f: + content = f.read() + return content def write_file(filepath, data): check_directory(os.path.dirname(filepath)) - with open(system_path(filepath), 'w') as f: + with io.open(system_path(filepath), 'w', encoding=ENCODING) as f: f.write(data) @@ -83,13 +94,25 @@ def check_content(path): return check_file(path) +def _get_byte_url_content(path, ui=None): + if ui is not None: + ui.print_('dowloading {}'.format(path)) + response = urlopen(path) + return response.read() + + +def _dump_byte_url_content(source, target): + """Caution: this method does not test for existing destination. + """ + byte_content = _get_byte_url_content(source) + with io.open(target, 'wb') as f: + f.write(byte_content) + + def get_content(path, ui=None): """Will be useful when we need to get content from url""" if content_type(path) == 'url': - if ui is not None: - ui.print_('dowloading {}'.format(path)) - response = urlopen(path) - return response.read() + return _get_byte_url_content(path, ui=ui).decode(encoding=ENCODING) else: return read_file(path) @@ -102,24 +125,27 @@ def move_content(source, target, overwrite = False): shutil.move(source, target) -def copy_content(source, target, overwrite = False): +def copy_content(source, target, overwrite=False): if source == target: return if not overwrite and os.path.exists(target): - raise IOError('target file exists') - shutil.copy(source, target) + raise IOError('{} file exists.'.format(target)) + if content_type(source) == 'url': + _dump_byte_url_content(source, target) + else: + shutil.copy(source, target) -def editor_input(editor, initial="", suffix='.tmp'): +def editor_input(editor, initial=u"", suffix='.tmp'): """Use an editor to get input""" + str_initial = initial.encode(ENCODING) # TODO: make it a configuration item with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file: tfile_name = temp_file.name - temp_file.write(initial) + temp_file.write(str_initial) cmd = editor.split() # this enable editor command with option, e.g. gvim -f cmd.append(tfile_name) subprocess.call(cmd) - with open(tfile_name) as temp_file: - content = temp_file.read() + content = read_file(tfile_name) os.remove(tfile_name) return content @@ -127,11 +153,9 @@ def editor_input(editor, initial="", suffix='.tmp'): def edit_file(editor, path_to_file, temporary=True): if temporary: check_file(path_to_file, fail=True) - with open(path_to_file) as f: - content = f.read() + content = read_file(path_to_file) content = editor_input(editor, content) - with open(path_to_file, 'w') as f: - f.write(content) + write_file(path_to_file, content) else: cmd = editor.split() # this enable editor command with option, e.g. gvim -f cmd.append(path_to_file) diff --git a/pubs/endecoder.py b/pubs/endecoder.py index ebbe0eb..c2f2616 100644 --- a/pubs/endecoder.py +++ b/pubs/endecoder.py @@ -1,10 +1,7 @@ from __future__ import print_function, absolute_import, division, unicode_literals -import copy -try: - import cStringIO as StringIO -except ImportError: - import StringIO +import io +import copy try: import bibtexparser as bp @@ -16,6 +13,12 @@ except ImportError: import yaml + +"""Important notice: + All functions and methods in this file assume and produce unicode data. +""" + + def sanitize_citekey(record): record['id'] = record['id'].strip('\n') return record @@ -56,7 +59,8 @@ class EnDecoder(object): """ def encode_metadata(self, metadata): - return yaml.safe_dump(metadata, allow_unicode=True, encoding='UTF-8', indent = 4) + return yaml.safe_dump(metadata, allow_unicode=True, + encoding=None, indent = 4) def decode_metadata(self, metadata_raw): return yaml.safe_load(metadata_raw) @@ -97,9 +101,7 @@ class EnDecoder(object): def decode_bibdata(self, bibdata_raw): """""" - bibdata_rawutf8 = bibdata_raw - #bibdata_rawutf8 = unicode(bibdata_raw, 'utf8') # FIXME this doesn't work - bibdata_stream = StringIO.StringIO(bibdata_rawutf8) + bibdata_stream = io.StringIO(bibdata_raw) return self._decode_bibdata(bibdata_stream) def _decode_bibdata(self, bibdata_stream): diff --git a/pubs/filebroker.py b/pubs/filebroker.py index 09e6079..29faea2 100644 --- a/pubs/filebroker.py +++ b/pubs/filebroker.py @@ -3,7 +3,8 @@ import re from .p3 import urlparse from .content import (check_file, check_directory, read_file, write_file, - system_path, check_content, content_type, get_content) + system_path, check_content, content_type, get_content, + copy_content) def filter_filename(filename, ext): @@ -161,12 +162,7 @@ class DocBroker(object): target_path = '{}://{}'.format(self.scheme, citekey + os.path.splitext(source_path)[-1]) full_target_path = self.real_docpath(target_path) - if not overwrite and check_file(full_target_path, fail=False): - raise IOError('{} file exists.'.format(full_target_path)) - - doc_content = get_content(full_source_path) - write_file(full_target_path, doc_content) - + copy_content(full_source_path, full_target_path, overwrite=overwrite) return target_path def remove_doc(self, docpath, silent=True): diff --git a/pubs/p3.py b/pubs/p3.py index 9257eab..a73276c 100644 --- a/pubs/p3.py +++ b/pubs/p3.py @@ -2,7 +2,6 @@ import sys if sys.version_info[0] == 2: import ConfigParser as configparser - import StringIO as io input = raw_input ustr = unicode uchr = unichr @@ -11,7 +10,6 @@ if sys.version_info[0] == 2: from httplib import HTTPConnection else: import configparser - import io ustr = str uchr = chr from urllib.parse import urlparse @@ -19,5 +17,11 @@ else: from http.client import HTTPConnection configparser = configparser -io = io input = input + + +def isbasestr(obj): + try: + return isinstance(obj, basestring) + except NameError: + return isinstance(obj, str) or isinstance(obj, bytes) diff --git a/tests/fake_env.py b/tests/fake_env.py index 9e007fa..c6fa458 100644 --- a/tests/fake_env.py +++ b/tests/fake_env.py @@ -1,4 +1,5 @@ import sys +import io import os import shutil import glob @@ -9,7 +10,7 @@ import fake_filesystem import fake_filesystem_shutil import fake_filesystem_glob -from pubs.p3 import io, input +from pubs.p3 import input from pubs import content, filebroker # code for fake fs @@ -19,6 +20,7 @@ real_open = open real_file = file real_shutil = shutil real_glob = glob +real_io = io @@ -35,6 +37,57 @@ real_glob = glob # return ml +ENCODING = 'utf8' + + +class UnicodeStringIOWrapper(object): + """This is a hack because fake_filesystem does not provied mock of io. + """ + + override = ['read', 'readline', 'readlines', 'write', 'writelines'] + + def __init__(self, strio): + self._strio = strio # The real StringIO + + def __getattr__(self, name): + if name in UnicodeStringIOWrapper.override: + return object.__getattribute__(self, name) + else: + return self._strio.__getattribute__(name) + + def read(self, *args): + return self._strio.read(*args).decode(ENCODING) + + def readline(self, *args): + return self._strio.readline(*args).decode(ENCODING) + + def readlines(self, *args): + return [l.decode(ENCODING) for l in self._strio.readlines(*args)] + + def write(self, data): + self._strio.write(data.encode(ENCODING)) + + def writelines(self, data): + self._strio.write([l.encode(ENCODING) for l in data]) + + def __enter__(self): + self._strio.__enter__() + return self + + def __exit__(self, *args): + return self._strio.__exit__(*args) + + +class FakeIO(object): + + def __init__(self, fake_open): + self.fake_open = fake_open + + def open(self, *args, **kwargs): + # Forces python3 mode for FakeFileOpen + fakefs_stringio = self.fake_open.Call(*args, **kwargs) + return UnicodeStringIOWrapper(fakefs_stringio) + def create_fake_fs(module_list): @@ -43,6 +96,7 @@ def create_fake_fs(module_list): fake_open = fake_filesystem.FakeFileOpen(fake_fs) fake_shutil = fake_filesystem_shutil.FakeShutilModule(fake_fs) fake_glob = fake_filesystem_glob.FakeGlobModule(fake_fs) + fake_io = FakeIO(fake_open) fake_fs.CreateDirectory(fake_os.path.expanduser('~')) @@ -51,19 +105,23 @@ def create_fake_fs(module_list): sys.modules['os'] = fake_os sys.modules['shutil'] = fake_shutil sys.modules['glob'] = fake_glob + sys.modules['io'] = fake_io for md in module_list: md.os = fake_os md.shutil = fake_shutil md.open = fake_open md.file = fake_open + md.io = fake_io return {'fs': fake_fs, 'os': fake_os, 'open': fake_open, + 'io': fake_io, 'shutil': fake_shutil, 'glob': fake_glob} + def unset_fake_fs(module_list): try: __builtins__.open = real_open @@ -75,12 +133,14 @@ def unset_fake_fs(module_list): sys.modules['os'] = real_os sys.modules['shutil'] = real_shutil sys.modules['glob'] = real_glob + sys.modules['io'] = real_io for md in module_list: md.os = real_os md.shutil = real_shutil md.open = real_open md.file = real_file + md.io = real_io def copy_dir(fs, real_dir, fake_dir = None): @@ -91,7 +151,7 @@ def copy_dir(fs, real_dir, fake_dir = None): real_path = os.path.abspath(real_os.path.join(real_dir, filename)) fake_path = fs['os'].path.join(fake_dir, filename) if real_os.path.isfile(real_path): - with real_open(real_path, 'r') as f: + with real_open(real_path, 'rb') as f: fs['fs'].CreateFile(fake_path, contents=f.read()) if real_os.path.isdir(real_path): fs['fs'].CreateDirectory(fake_path) @@ -103,8 +163,8 @@ def copy_dir(fs, real_dir, fake_dir = None): def redirect(f): def newf(*args, **kwargs): old_stderr, old_stdout = sys.stderr, sys.stdout - stdout = io.StringIO() - stderr = io.StringIO() + stdout = io.BytesIO() + stderr = io.BytesIO() sys.stdout, sys.stderr = stdout, stderr try: return f(*args, **kwargs), stdout, stderr diff --git a/tests/fixtures.py b/tests/fixtures.py index 7ec3571..6793a8f 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +from __future__ import unicode_literals + import dotdot from pubs import endecoder import str_fixtures @@ -19,6 +21,8 @@ doe_bib = """ year = "2013"} """ +dummy_metadata = {'docfile': 'docsdir://hop.la', 'tags': set(['a', 'b'])} + franny_bibdata = coder.decode_bibdata(franny_bib) franny_bibentry = franny_bibdata['Franny1961'] diff --git a/tests/str_fixtures.py b/tests/str_fixtures.py index b227ea6..3480f77 100644 --- a/tests/str_fixtures.py +++ b/tests/str_fixtures.py @@ -1,3 +1,6 @@ +from __future__ import unicode_literals + + bibtex_external0 = """ @techreport{Page99, number = {1999-66}, diff --git a/tests/test_endecoder.py b/tests/test_endecoder.py index 4dd72be..96e91b1 100644 --- a/tests/test_endecoder.py +++ b/tests/test_endecoder.py @@ -6,7 +6,9 @@ import yaml import dotdot from pubs import endecoder +from pubs.p3 import ustr +from fixtures import dummy_metadata from str_fixtures import bibtex_raw0, metadata_raw0, turing_bib @@ -21,8 +23,19 @@ def compare_yaml_str(s1, s2): class TestEnDecode(unittest.TestCase): - def test_endecode_bibtex(self): + def test_encode_bibtex_is_unicode(self): + decoder = endecoder.EnDecoder() + entry = decoder.decode_bibdata(bibtex_raw0) + bibraw = decoder.encode_bibdata(entry) + self.assertIsInstance(bibraw, ustr) + def test_encode_metadat_is_unicode(self): + decoder = endecoder.EnDecoder() + data = decoder.encode_metadata(dummy_metadata) + self.assertIsInstance(data, ustr) + + + def test_endecode_bibtex(self): decoder = endecoder.EnDecoder() entry = decoder.decode_bibdata(bibtex_raw0) diff --git a/tests/test_usecase.py b/tests/test_usecase.py index 5032ffb..e72c0d6 100644 --- a/tests/test_usecase.py +++ b/tests/test_usecase.py @@ -1,4 +1,5 @@ -from __future__ import print_function +from __future__ import print_function, unicode_literals + import unittest import re import os @@ -85,7 +86,7 @@ class CommandTestCase(unittest.TestCase): else: if capture_output: - assert isinstance(cmd, str) + assert p3.isbasestr(cmd) _, stdout, stderr = fake_env.redirect(pubs_cmd.execute)(cmd.split()) else: pubs_cmd.execute(cmd.split()) @@ -213,13 +214,13 @@ class TestList(DataCommandTestCase): class TestUsecase(DataCommandTestCase): def test_first(self): - correct = ['Initializing pubs in /paper_first\n', - '', - '[Page99] Page, Lawrence et al. "The PageRank Citation Ranking: Bringing Order to the Web." (1999) \n', - '', - '', - 'search network\n', - '[Page99] Page, Lawrence et al. "The PageRank Citation Ranking: Bringing Order to the Web." (1999) search network\n' + correct = [b'Initializing pubs in /paper_first\n', + b'', + b'[Page99] Page, Lawrence et al. "The PageRank Citation Ranking: Bringing Order to the Web." (1999) \n', + b'', + b'', + b'search network\n', + b'[Page99] Page, Lawrence et al. "The PageRank Citation Ranking: Bringing Order to the Web." (1999) search network\n' ] cmds = ['pubs init -p paper_first/', @@ -305,9 +306,9 @@ class TestUsecase(DataCommandTestCase): ('pubs add', [str_fixtures.bibtex_external0]), 'pubs export Page99', ] - outs = self.execute_cmds(cmds) - self.assertEqual(endecoder.EnDecoder().decode_bibdata(outs[2]), fixtures.page_bibdata) + out_raw = outs[2].decode() + self.assertEqual(endecoder.EnDecoder().decode_bibdata(out_raw), fixtures.page_bibdata) def test_import(self): cmds = ['pubs init',