From c87b89bf106e276ba51051024681b744239bae10 Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Tue, 30 Jan 2018 17:37:11 -0500 Subject: [PATCH 1/6] Fix wrong test. --- tests/test_queries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_queries.py b/tests/test_queries.py index 245833b..59c71cd 100644 --- a/tests/test_queries.py +++ b/tests/test_queries.py @@ -19,7 +19,7 @@ class TestAuthorFilter(unittest.TestCase): def test_fails_if_no_author(self): no_doe = doe_paper.deepcopy() - no_doe.bibentry['author'] = [] + no_doe.bibentry['Doe2013']['author'] = [] self.assertFalse(AuthorFilter('whatever')(no_doe)) def test_match_case(self): From cff028d34508db8f4bb94dc0a374034e012cddee Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Tue, 30 Jan 2018 17:42:56 -0500 Subject: [PATCH 2/6] Minor refactoring of string comparison in query filters. --- pubs/query.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pubs/query.py b/pubs/query.py index baa17ca..52b96c9 100644 --- a/pubs/query.py +++ b/pubs/query.py @@ -29,6 +29,9 @@ class QueryFilter(object): def __call__(self, paper): raise NotImplementedError + def _is_query_in(self, field_value): + return self.query in self._lower(field_value) + def _lower(self, s): return s if self.case else s.lower() @@ -42,7 +45,7 @@ class FieldFilter(QueryFilter): def __call__(self, paper): return (self.field in paper.bibdata and - self.query in self._lower(paper.bibdata[self.field])) + self._is_query_in(paper.bibdata[self.field])) class AuthorFilter(QueryFilter): @@ -52,14 +55,14 @@ class AuthorFilter(QueryFilter): if 'author' not in paper.bibdata: return False else: - return any([self.query in self._lower(bibstruct.author_last(author)) + return any([self._is_query_in(bibstruct.author_last(author)) for author in paper.bibdata['author']]) class TagFilter(QueryFilter): def __call__(self, paper): - return any([self.query in self._lower(t) for t in paper.tags]) + return any([self._is_query_in(t) for t in paper.tags]) class YearFilter(QueryFilter): From 1bcbf65dd843fae38d72114f406c4a3ed22b5387 Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Tue, 30 Jan 2018 17:56:00 -0500 Subject: [PATCH 3/6] Implements latex escape in queries. --- pubs/query.py | 8 +++++--- tests/test_queries.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/pubs/query.py b/pubs/query.py index 52b96c9..40f3460 100644 --- a/pubs/query.py +++ b/pubs/query.py @@ -1,3 +1,4 @@ +from bibtexparser.latexenc import latex_to_unicode from . import bibstruct @@ -24,15 +25,16 @@ class QueryFilter(object): if case_sensitive is None: case_sensitive = not query.islower() self.case = case_sensitive - self.query = self._lower(query) + self.query = self._normalize(query) def __call__(self, paper): raise NotImplementedError def _is_query_in(self, field_value): - return self.query in self._lower(field_value) + return self.query in self._normalize(field_value) - def _lower(self, s): + def _normalize(self, s): + s = latex_to_unicode(s) return s if self.case else s.lower() diff --git a/tests/test_queries.py b/tests/test_queries.py index 59c71cd..0508794 100644 --- a/tests/test_queries.py +++ b/tests/test_queries.py @@ -84,6 +84,14 @@ class TestCheckField(unittest.TestCase): self.assertFalse( FieldFilter('title', 'nice', case_sensitive=True)(doe_paper)) + def test_latex_enc(self): + latexenc_paper = doe_paper.deepcopy() + latexenc_paper.bibentry['Doe2013']['title'] = "{G}r{\\\"u}n" + self.assertTrue( + FieldFilter('title', 'Grün')(latexenc_paper)) + self.assertTrue( + FieldFilter('title', 'Gr{\\\"u}n')(latexenc_paper)) + class TestCheckQueryBlock(unittest.TestCase): @@ -122,6 +130,14 @@ class TestFilterPaper(unittest.TestCase): self.assertFalse(get_paper_filter(['author:doe', 'year:2014-'])(doe_paper)) self.assertFalse(get_paper_filter(['author:doee', 'year:2014'])(doe_paper)) + def test_latex_enc(self): + latexenc_paper = doe_paper.deepcopy() + latexenc_paper.bibentry['Doe2013']['title'] = "{E}l Ni{\~n}o" + latexenc_paper.bibentry['Doe2013']['author'][0] = "Erd\H{o}s, Paul" + self.assertTrue(get_paper_filter(['title:El'])(latexenc_paper)) + self.assertTrue(get_paper_filter(['title:Niño'])(latexenc_paper)) + self.assertTrue(get_paper_filter(['author:erdős'])(latexenc_paper)) + if __name__ == '__main__': unittest.main() From f3ec9621ee8bc0d8ec2267fff6e97e5028f80892 Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Wed, 31 Jan 2018 10:30:27 -0500 Subject: [PATCH 4/6] Adds and checks unicode normalization. Fixes #103. --- pubs/query.py | 6 +++++- tests/test_queries.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pubs/query.py b/pubs/query.py index 40f3460..68b3420 100644 --- a/pubs/query.py +++ b/pubs/query.py @@ -1,4 +1,7 @@ +import unicodedata + from bibtexparser.latexenc import latex_to_unicode + from . import bibstruct @@ -34,7 +37,8 @@ class QueryFilter(object): return self.query in self._normalize(field_value) def _normalize(self, s): - s = latex_to_unicode(s) + s = unicodedata.normalize('NFC', latex_to_unicode(s)) + # Note: in theory latex_to_unicode also normalizes return s if self.case else s.lower() diff --git a/tests/test_queries.py b/tests/test_queries.py index 0508794..e60c95f 100644 --- a/tests/test_queries.py +++ b/tests/test_queries.py @@ -92,6 +92,12 @@ class TestCheckField(unittest.TestCase): self.assertTrue( FieldFilter('title', 'Gr{\\\"u}n')(latexenc_paper)) + def test_normalize_unicode(self): + latexenc_paper = doe_paper.deepcopy() + latexenc_paper.bibentry['Doe2013']['title'] = "Jalape\u00f1o" + self.assertTrue( + FieldFilter('title', "Jalapen\u0303o")(latexenc_paper)) + class TestCheckQueryBlock(unittest.TestCase): @@ -138,6 +144,11 @@ class TestFilterPaper(unittest.TestCase): self.assertTrue(get_paper_filter(['title:Niño'])(latexenc_paper)) self.assertTrue(get_paper_filter(['author:erdős'])(latexenc_paper)) + def test_normalize_unicode(self): + latexenc_paper = doe_paper.deepcopy() + latexenc_paper.bibentry['Doe2013']['title'] = "{E}l Ni{\~n}o" + self.assertTrue(get_paper_filter(['title:Nin\u0303o'])(latexenc_paper)) + if __name__ == '__main__': unittest.main() From 248bf62317e3ca7a1e97e85afc864c995bdc0da2 Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Wed, 31 Jan 2018 10:53:29 -0500 Subject: [PATCH 5/6] Fixes unicode for python2 in test_queries. --- tests/test_queries.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_queries.py b/tests/test_queries.py index e60c95f..ea92629 100644 --- a/tests/test_queries.py +++ b/tests/test_queries.py @@ -1,3 +1,6 @@ +# coding: utf8 + +from __future__ import unicode_literals import unittest import dotdot From 6e39cea4731fa33d28fd959a4a21143fde7eb785 Mon Sep 17 00:00:00 2001 From: Olivier Mangin Date: Sat, 3 Feb 2018 18:06:33 -0500 Subject: [PATCH 6/6] Adds `--strict` option to list to force strict unicode comparison. --- pubs/commands/list_cmd.py | 5 ++++- pubs/query.py | 43 +++++++++++++++++++++++++++------------ tests/test_queries.py | 23 +++++++++++++++++++++ tests/test_usecase.py | 27 ++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 14 deletions(-) diff --git a/pubs/commands/list_cmd.py b/pubs/commands/list_cmd.py index 2c1ec90..7b0b9ac 100644 --- a/pubs/commands/list_cmd.py +++ b/pubs/commands/list_cmd.py @@ -15,6 +15,8 @@ def parser(subparsers, conf): default=None, dest='case_sensitive') parser.add_argument('-I', '--force-case', action='store_true', dest='case_sensitive') + parser.add_argument('--strict', action='store_true', default=False, + help='force strict unicode comparison of query') parser.add_argument('-a', '--alphabetical', action='store_true', dest='alphabetical', default=False, help='lexicographic order on the citekeys.') @@ -34,7 +36,8 @@ def command(conf, args): ui = get_ui() rp = repo.Repository(conf) papers = filter(get_paper_filter(args.query, - case_sensitive=args.case_sensitive), + case_sensitive=args.case_sensitive, + strict=args.strict), rp.all_papers()) if args.nodocs: papers = [p for p in papers if p.docpath is None] diff --git a/pubs/query.py b/pubs/query.py index 68b3420..9e65eb1 100644 --- a/pubs/query.py +++ b/pubs/query.py @@ -23,11 +23,20 @@ class InvalidQuery(ValueError): class QueryFilter(object): + """Filter function for papers built from a given query. - def __init__(self, query, case_sensitive=None): + :param case_sensitive: forces case (in)sensitivity; default is to + only be sensitive if query contains uppercase + :param strict: if set to True, compares the raw unicode without + interpreting latex commands, normalizing unicode, or ignoring case. + (Overrides the case_sensitive parameter.) + """ + + def __init__(self, query, case_sensitive=None, strict=False): if case_sensitive is None: case_sensitive = not query.islower() self.case = case_sensitive + self.strict = strict self.query = self._normalize(query) def __call__(self, paper): @@ -37,16 +46,20 @@ class QueryFilter(object): return self.query in self._normalize(field_value) def _normalize(self, s): - s = unicodedata.normalize('NFC', latex_to_unicode(s)) - # Note: in theory latex_to_unicode also normalizes - return s if self.case else s.lower() + if self.strict: + return s + else: + s = unicodedata.normalize('NFC', latex_to_unicode(s)) + # Note: in theory latex_to_unicode also normalizes + return s if self.case else s.lower() class FieldFilter(QueryFilter): """Generic filter of form `query in paper['field']`""" - def __init__(self, field, query, case_sensitive=None): - super(FieldFilter, self).__init__(query, case_sensitive=case_sensitive) + def __init__(self, field, query, case_sensitive=None, strict=False): + super(FieldFilter, self).__init__(query, case_sensitive=case_sensitive, + strict=strict) self.field = field def __call__(self, paper): @@ -76,7 +89,7 @@ class YearFilter(QueryFilter): whose year field is set and can be converted to an int. """ - def __init__(self, query, case_sensitive=None): + def __init__(self, query): split = query.split('-') self.start = self._str_to_year(split[0]) if len(split) == 1: @@ -120,25 +133,29 @@ def _get_field_value(query_block): return (field, value) -def _query_block_to_filter(query_block, case_sensitive=None): +def _query_block_to_filter(query_block, case_sensitive=None, strict=False): field, value = _get_field_value(query_block) if field == 'tag': - return TagFilter(value, case_sensitive=case_sensitive) + return TagFilter(value, case_sensitive=case_sensitive, strict=strict) elif field == 'author': - return AuthorFilter(value, case_sensitive=case_sensitive) + return AuthorFilter(value, case_sensitive=case_sensitive, + strict=strict) elif field == 'year': return YearFilter(value) else: - return FieldFilter(field, value, case_sensitive=case_sensitive) + return FieldFilter(field, value, case_sensitive=case_sensitive, + strict=strict) # TODO implement search by type of document -def get_paper_filter(query, case_sensitive=None): +def get_paper_filter(query, case_sensitive=None, strict=False): """If case_sensitive is not given, only check case if query is not lowercase. :args query: list of query blocks (strings) """ - filters = [_query_block_to_filter(query_block, case_sensitive=case_sensitive) + filters = [_query_block_to_filter(query_block, + case_sensitive=case_sensitive, + strict=strict) for query_block in query] return lambda paper: all([f(paper) for f in filters]) diff --git a/tests/test_queries.py b/tests/test_queries.py index ea92629..d8cfa86 100644 --- a/tests/test_queries.py +++ b/tests/test_queries.py @@ -101,6 +101,22 @@ class TestCheckField(unittest.TestCase): self.assertTrue( FieldFilter('title', "Jalapen\u0303o")(latexenc_paper)) + def test_strict(self): + latexenc_paper = doe_paper.deepcopy() + latexenc_paper.bibentry['Doe2013']['title'] = "Jalape\u00f1o" + self.assertFalse(FieldFilter('title', "Jalapen\u0303o", + strict=True)(latexenc_paper)) + latexenc_paper.bibentry['Doe2013']['title'] = "{G}ros" + self.assertFalse( + FieldFilter('title', "Gros", strict=True)(latexenc_paper)) + + def test_strict_implies_case(self): + latexenc_paper = doe_paper.deepcopy() + latexenc_paper.bibentry['Doe2013']['title'] = "Gros" + self.assertFalse( + FieldFilter('title', "gros", case_sensitive=False, + strict=True)(latexenc_paper)) + class TestCheckQueryBlock(unittest.TestCase): @@ -146,12 +162,19 @@ class TestFilterPaper(unittest.TestCase): self.assertTrue(get_paper_filter(['title:El'])(latexenc_paper)) self.assertTrue(get_paper_filter(['title:Niño'])(latexenc_paper)) self.assertTrue(get_paper_filter(['author:erdős'])(latexenc_paper)) + self.assertTrue(get_paper_filter(['title:{E}l'])(latexenc_paper)) def test_normalize_unicode(self): latexenc_paper = doe_paper.deepcopy() latexenc_paper.bibentry['Doe2013']['title'] = "{E}l Ni{\~n}o" self.assertTrue(get_paper_filter(['title:Nin\u0303o'])(latexenc_paper)) + def test_strict(self): + latexenc_paper = doe_paper.deepcopy() + latexenc_paper.bibentry['Doe2013']['title'] = "El Ni{\~n}o" + self.assertFalse(get_paper_filter( + ['title:Nin\u0303o'], strict=True)(latexenc_paper)) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_usecase.py b/tests/test_usecase.py index eb3f22a..9bcb6ec 100644 --- a/tests/test_usecase.py +++ b/tests/test_usecase.py @@ -398,6 +398,33 @@ class TestList(DataCommandTestCase): outs = self.execute_cmds(cmds) self.assertEqual(0 + 1, len(outs[-1].split('\n'))) + def test_list_strict_forces_case(self): + cmds = ['pubs init', + 'pubs list', + 'pubs import data/', + 'pubs list --ignore-case --strict title:lAnguage', + ] + outs = self.execute_cmds(cmds) + self.assertEqual(0 + 1, len(outs[-1].split('\n'))) + + def test_list_strict(self): + cmds = ['pubs init', + 'pubs list', + 'pubs import data/', + 'pubs list --strict title:{L}anguage', + ] + outs = self.execute_cmds(cmds) + self.assertEqual(0 + 1, len(outs[-1].split('\n'))) + + def test_list_latex_protection(self): + cmds = ['pubs init', + 'pubs list', + 'pubs import data/', + 'pubs list title:{L}anguage', + ] + outs = self.execute_cmds(cmds) + self.assertEqual(1 + 1, len(outs[-1].split('\n'))) + class TestTag(DataCommandTestCase):