Merge pull request #116 from pubs/feat/unicode_comparison
Fixes unicode comparison
This commit is contained in:
commit
0d38c73965
@ -15,6 +15,8 @@ def parser(subparsers, conf):
|
||||
default=None, dest='case_sensitive')
|
||||
parser.add_argument('-I', '--force-case', action='store_true',
|
||||
dest='case_sensitive')
|
||||
parser.add_argument('--strict', action='store_true', default=False,
|
||||
help='force strict unicode comparison of query')
|
||||
parser.add_argument('-a', '--alphabetical', action='store_true',
|
||||
dest='alphabetical', default=False,
|
||||
help='lexicographic order on the citekeys.')
|
||||
@ -34,7 +36,8 @@ def command(conf, args):
|
||||
ui = get_ui()
|
||||
rp = repo.Repository(conf)
|
||||
papers = filter(get_paper_filter(args.query,
|
||||
case_sensitive=args.case_sensitive),
|
||||
case_sensitive=args.case_sensitive,
|
||||
strict=args.strict),
|
||||
rp.all_papers())
|
||||
if args.nodocs:
|
||||
papers = [p for p in papers if p.docpath is None]
|
||||
|
@ -1,3 +1,7 @@
|
||||
import unicodedata
|
||||
|
||||
from bibtexparser.latexenc import latex_to_unicode
|
||||
|
||||
from . import bibstruct
|
||||
|
||||
|
||||
@ -19,30 +23,48 @@ class InvalidQuery(ValueError):
|
||||
|
||||
|
||||
class QueryFilter(object):
    """Filter function for papers built from a given query.

    Subclasses implement `__call__(paper)` and use `_is_query_in` to test
    whether the (normalized) query is a substring of a paper value.

    :param query: the string searched for in the paper values.
    :param case_sensitive: forces case (in)sensitivity; when left to None,
        matching is case sensitive unless the query is entirely lowercase
        (`query.islower()`).
    :param strict: if set to True, compares the raw unicode without
        interpreting latex commands, normalizing unicode, or ignoring case.
        (Overrides the case_sensitive parameter.)
    """

    def __init__(self, query, case_sensitive=None, strict=False):
        if case_sensitive is None:
            case_sensitive = not query.islower()
        # `case` and `strict` must be set before `_normalize` runs: it
        # reads both of them.
        self.case = case_sensitive
        self.strict = strict
        self.query = self._normalize(query)

    def __call__(self, paper):
        raise NotImplementedError

    def _lower(self, s):
        """Lower-case `s` unless the filter is case sensitive.

        Only handles case (no latex decoding, no unicode normalization).
        Kept because other filter code in this file still calls it.
        """
        return s if self.case else s.lower()

    def _is_query_in(self, field_value):
        """Return True if the query is a substring of `field_value`.

        `field_value` goes through the same normalization as the query.
        """
        return self.query in self._normalize(field_value)

    def _normalize(self, s):
        """Return `s` in the form used for comparisons.

        In strict mode `s` is returned untouched. Otherwise latex commands
        are converted to unicode, the result is NFC-normalized and, when
        the filter is case insensitive, lower-cased.
        """
        if self.strict:
            return s
        s = unicodedata.normalize('NFC', latex_to_unicode(s))
        # Note: in theory latex_to_unicode also normalizes
        return s if self.case else s.lower()
|
||||
|
||||
|
||||
class FieldFilter(QueryFilter):
    """Generic filter of form `query in paper['field']`

    :param field: name of the bibdata field to look into.
    :param query: string searched for in that field.
    :param case_sensitive: passed on to `QueryFilter`.
    :param strict: passed on to `QueryFilter`.
    """

    def __init__(self, field, query, case_sensitive=None, strict=False):
        super(FieldFilter, self).__init__(query, case_sensitive=case_sensitive,
                                          strict=strict)
        self.field = field

    def __call__(self, paper):
        """Return True if the paper has the field and it contains the query."""
        # The membership test comes first so a missing field yields False
        # instead of a KeyError.
        return (self.field in paper.bibdata and
                self._is_query_in(paper.bibdata[self.field]))
|
||||
|
||||
|
||||
class AuthorFilter(QueryFilter):
|
||||
@ -52,14 +74,14 @@ class AuthorFilter(QueryFilter):
|
||||
if 'author' not in paper.bibdata:
|
||||
return False
|
||||
else:
|
||||
return any([self.query in self._lower(bibstruct.author_last(author))
|
||||
return any([self._is_query_in(bibstruct.author_last(author))
|
||||
for author in paper.bibdata['author']])
|
||||
|
||||
|
||||
class TagFilter(QueryFilter):
    """Filter accepting papers with at least one tag containing the query."""

    def __call__(self, paper):
        # Generator form: stops at the first matching tag.
        return any(self._is_query_in(tag) for tag in paper.tags)
|
||||
|
||||
|
||||
class YearFilter(QueryFilter):
|
||||
@ -67,7 +89,7 @@ class YearFilter(QueryFilter):
|
||||
whose year field is set and can be converted to an int.
|
||||
"""
|
||||
|
||||
def __init__(self, query, case_sensitive=None):
|
||||
def __init__(self, query):
|
||||
split = query.split('-')
|
||||
self.start = self._str_to_year(split[0])
|
||||
if len(split) == 1:
|
||||
@ -111,25 +133,29 @@ def _get_field_value(query_block):
|
||||
return (field, value)
|
||||
|
||||
|
||||
def _query_block_to_filter(query_block, case_sensitive=None, strict=False):
    """Build the filter object corresponding to one query block.

    The block is split with `_get_field_value`; the field selects the
    filter class ('tag', 'author', 'year', anything else is handled by the
    generic `FieldFilter`).

    :param query_block: a single query block (string).
    :param case_sensitive: forwarded to the text filters.
    :param strict: forwarded to the text filters.
    :return: a callable filter taking a paper.
    """
    field, value = _get_field_value(query_block)
    if field == 'tag':
        return TagFilter(value, case_sensitive=case_sensitive, strict=strict)
    elif field == 'author':
        return AuthorFilter(value, case_sensitive=case_sensitive,
                            strict=strict)
    elif field == 'year':
        # YearFilter only takes the query: no case or strict option.
        return YearFilter(value)
    else:
        return FieldFilter(field, value, case_sensitive=case_sensitive,
                           strict=strict)
|
||||
|
||||
|
||||
# TODO implement search by type of document
|
||||
def get_paper_filter(query, case_sensitive=None, strict=False):
    """Return a function telling whether a paper matches the whole query.

    If case_sensitive is not given, only check case if query
    is not lowercase.

    :param query: list of query blocks (strings)
    :param case_sensitive: forwarded to the filter built for each block.
    :param strict: forwarded to the filter built for each block.
    :return: function taking a paper and returning True when every filter
        accepts it (always True for an empty query).
    """
    # Filters are built once, here, so the returned function only has to
    # apply them to each paper.
    filters = [_query_block_to_filter(query_block,
                                      case_sensitive=case_sensitive,
                                      strict=strict)
               for query_block in query]
    return lambda paper: all(f(paper) for f in filters)
|
||||
|
@ -1,3 +1,6 @@
|
||||
# coding: utf8
|
||||
|
||||
from __future__ import unicode_literals
|
||||
import unittest
|
||||
|
||||
import dotdot
|
||||
@ -19,7 +22,7 @@ class TestAuthorFilter(unittest.TestCase):
|
||||
|
||||
def test_fails_if_no_author(self):
    """An author query does not match a paper whose author list is empty."""
    no_doe = doe_paper.deepcopy()
    # The author list lives in the 'Doe2013' entry, not at the top level
    # of bibentry.
    no_doe.bibentry['Doe2013']['author'] = []
    self.assertFalse(AuthorFilter('whatever')(no_doe))
|
||||
|
||||
def test_match_case(self):
|
||||
@ -84,6 +87,36 @@ class TestCheckField(unittest.TestCase):
|
||||
self.assertFalse(
|
||||
FieldFilter('title', 'nice', case_sensitive=True)(doe_paper))
|
||||
|
||||
def test_latex_enc(self):
    """A latex-encoded title matches unicode and latex-encoded queries."""
    paper = doe_paper.deepcopy()
    paper.bibentry['Doe2013']['title'] = "{G}r{\\\"u}n"
    for query in ('Grün', 'Gr{\\\"u}n'):
        title_filter = FieldFilter('title', query)
        self.assertTrue(title_filter(paper))
|
||||
|
||||
def test_normalize_unicode(self):
    """A decomposed query (n + combining tilde) matches a precomposed title."""
    paper = doe_paper.deepcopy()
    paper.bibentry['Doe2013']['title'] = "Jalape\u00f1o"
    decomposed_filter = FieldFilter('title', "Jalapen\u0303o")
    self.assertTrue(decomposed_filter(paper))
|
||||
|
||||
def test_strict(self):
    """Strict mode neither normalizes unicode nor interprets latex braces."""
    paper = doe_paper.deepcopy()

    # Precomposed title vs. decomposed query: no match without normalization.
    paper.bibentry['Doe2013']['title'] = "Jalape\u00f1o"
    decomposed_filter = FieldFilter('title', "Jalapen\u0303o", strict=True)
    self.assertFalse(decomposed_filter(paper))

    # Brace-protected title vs. plain query: braces are compared as is.
    paper.bibentry['Doe2013']['title'] = "{G}ros"
    plain_filter = FieldFilter('title', "Gros", strict=True)
    self.assertFalse(plain_filter(paper))
|
||||
|
||||
def test_strict_implies_case(self):
    """With strict=True, case_sensitive=False does not make 'gros' match 'Gros'."""
    paper = doe_paper.deepcopy()
    paper.bibentry['Doe2013']['title'] = "Gros"
    lowercase_filter = FieldFilter('title', "gros", case_sensitive=False,
                                   strict=True)
    self.assertFalse(lowercase_filter(paper))
|
||||
|
||||
|
||||
class TestCheckQueryBlock(unittest.TestCase):
|
||||
|
||||
@ -122,6 +155,26 @@ class TestFilterPaper(unittest.TestCase):
|
||||
self.assertFalse(get_paper_filter(['author:doe', 'year:2014-'])(doe_paper))
|
||||
self.assertFalse(get_paper_filter(['author:doee', 'year:2014'])(doe_paper))
|
||||
|
||||
def test_latex_enc(self):
    """Latex-encoded title and author match plain, unicode and latex queries."""
    latexenc_paper = doe_paper.deepcopy()
    # Backslashes are doubled: `\~` and `\H` are not valid Python escape
    # sequences and trigger a warning on recent interpreters. The string
    # values are unchanged.
    latexenc_paper.bibentry['Doe2013']['title'] = "{E}l Ni{\\~n}o"
    latexenc_paper.bibentry['Doe2013']['author'][0] = "Erd\\H{o}s, Paul"
    self.assertTrue(get_paper_filter(['title:El'])(latexenc_paper))
    self.assertTrue(get_paper_filter(['title:Niño'])(latexenc_paper))
    self.assertTrue(get_paper_filter(['author:erdős'])(latexenc_paper))
    self.assertTrue(get_paper_filter(['title:{E}l'])(latexenc_paper))
|
||||
|
||||
def test_normalize_unicode(self):
    """A decomposed unicode query matches a latex-encoded title."""
    latexenc_paper = doe_paper.deepcopy()
    # `\\~` instead of `\~`: same string value, without the invalid
    # escape sequence.
    latexenc_paper.bibentry['Doe2013']['title'] = "{E}l Ni{\\~n}o"
    self.assertTrue(get_paper_filter(['title:Nin\u0303o'])(latexenc_paper))
|
||||
|
||||
def test_strict(self):
    """In strict mode a unicode query does not match a latex-encoded title."""
    latexenc_paper = doe_paper.deepcopy()
    # `\\~` instead of `\~`: same string value, without the invalid
    # escape sequence.
    latexenc_paper.bibentry['Doe2013']['title'] = "El Ni{\\~n}o"
    self.assertFalse(get_paper_filter(
        ['title:Nin\u0303o'], strict=True)(latexenc_paper))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run the test cases of this module when it is executed as a script.
    unittest.main()
|
||||
|
@ -398,6 +398,33 @@ class TestList(DataCommandTestCase):
|
||||
outs = self.execute_cmds(cmds)
|
||||
self.assertEqual(0 + 1, len(outs[-1].split('\n')))
|
||||
|
||||
def test_list_strict_forces_case(self):
    """`--strict` combined with `--ignore-case` still compares case."""
    outs = self.execute_cmds([
        'pubs init',
        'pubs list',
        'pubs import data/',
        'pubs list --ignore-case --strict title:lAnguage',
    ])
    # 0 + 1: presumably zero listed papers, plus the single element that
    # splitting on newlines always yields.
    listed = outs[-1].split('\n')
    self.assertEqual(0 + 1, len(listed))
|
||||
|
||||
def test_list_strict(self):
    """With `--strict`, the brace-protected query `{L}anguage` lists nothing."""
    outs = self.execute_cmds([
        'pubs init',
        'pubs list',
        'pubs import data/',
        'pubs list --strict title:{L}anguage',
    ])
    # 0 + 1: presumably zero listed papers, plus the single element that
    # splitting on newlines always yields.
    listed = outs[-1].split('\n')
    self.assertEqual(0 + 1, len(listed))
|
||||
|
||||
def test_list_latex_protection(self):
    """Without `--strict`, the brace-protected query `{L}anguage` lists a paper."""
    outs = self.execute_cmds([
        'pubs init',
        'pubs list',
        'pubs import data/',
        'pubs list title:{L}anguage',
    ])
    # 1 + 1: presumably one listed paper, plus the extra element that
    # splitting on newlines yields.
    listed = outs[-1].split('\n')
    self.assertEqual(1 + 1, len(listed))
|
||||
|
||||
|
||||
class TestTag(DataCommandTestCase):
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user