def test_keep_case():
    "Check `segment` output with and without the keep-case flag."
    # One input exercises a leading tab, an embedded newline and mixed casing.
    text = '\tMaintainCasing \nwith variableSpaCING'
    original_casing = ['Maintain', 'Casing', 'with', 'variable', 'SpaCING']
    lower_casing = ['maintain', 'casing', 'with', 'variable', 'spacing']

    assert segment(text, True) == original_casing
    assert segment(text, False) == lower_casing
def segment(self, text, keep_case=False):
    """Return list of words that is the best segmentation of `text`.

    When `keep_case` is true, the words are passed through `maintain_case`
    so each one carries the casing it has in `text`; otherwise the words
    are returned exactly as `isegment` yields them.
    """
    words = list(self.isegment(text))
    if keep_case:
        return self.maintain_case(text, words)
    return words


def maintain_case(self, orig_text, seg_text):
    """Return the words of `seg_text` re-cased by referring to `orig_text`.

    `seg_text` is the list of words produced by segmenting `orig_text`.
    Each character of each word is paired, in order, with the next
    character of `orig_text` that equals it ignoring case, and the
    original character is emitted in its place.

    Characters of `orig_text` that have no counterpart in `seg_text`
    (whitespace, punctuation, or anything else the segmenter dropped) are
    skipped, so they can neither leak into the result nor shift the
    alignment of later characters. A segmented character with no
    remaining counterpart is kept unchanged instead of raising
    `IndexError`.
    """
    cased_words = []
    cursor = 0  # index of the first not-yet-consumed character of orig_text
    text_len = len(orig_text)

    for word in seg_text:
        cased_chars = []
        for char in word:
            # Look ahead without committing, so a character that cannot be
            # found does not swallow the rest of the original text.
            # `in` rather than `==` because lower-casing a single character
            # may yield more than one code point.
            probe = cursor
            while probe < text_len and char not in orig_text[probe].lower():
                probe += 1

            if probe < text_len:
                cased_chars.append(orig_text[probe])
                cursor = probe + 1
            else:
                cased_chars.append(char)
        cased_words.append(''.join(cased_chars))

    return cased_words