diff --git a/src/textcode/gibberish.py b/src/textcode/gibberish.py
new file mode 100644
index 00000000000..ff6034c440f
--- /dev/null
+++ b/src/textcode/gibberish.py
@@ -0,0 +1,135 @@
+#!/usr/bin/python
+#
+# From: https://raw.githubusercontent.com/yapus/gibberish/01637fe1fda827529ca76b8d6fee2de9100719f1/gibberish/gibberish.py
+#
+# 12Jun2017 Petr Janata - added srcfile and outfile
+# 17Jun2017 Petr Janata - expanded set of accepted characters to include digits and hyphen
+#
+# which is based off of:
+# https://raw.githubusercontent.com/rrenaud/Gibberish-Detector/aa1d4e4555362b3dada97ebe6ecc23a84fc470fe/gib_detect_train.py
+#
+
+import math
+import pickle
+from pathlib import Path
+
+data_dir = Path(__file__).parent / 'data' / 'gibberish'
+model_path = data_dir / 'gib_model.pki'
+big_file_path = data_dir / 'big.txt'
+good_file_path = data_dir / 'good.txt'
+bad_file_path = data_dir / 'bad.txt'
+
+accepted_chars = 'abcdefghijklmnopqrstuvwxyz0123456789- '
+pos = dict([(char, idx) for idx, char in enumerate(accepted_chars)])
+
+
+class Gibberish(object):
+    def __init__(self):
+        if model_path.exists():
+            self.load_persisted_model()
+        else:
+            self.train()
+
+    def persist_model(self):
+        with open(model_path, mode='wb') as f:
+            pickle.dump(vars(self), f)
+
+    def load_persisted_model(self):
+        with open(model_path, mode='rb') as f:
+            persisted_model = pickle.load(f)
+        for key, value in persisted_model.items():
+            setattr(self, key, value)
+
+    def normalize(self, line):
+        """ Return only the subset of chars from accepted_chars.
+        This helps keep the model relatively small by ignoring punctuation,
+        infrequent symbols, etc. """
+        return [c.lower() for c in line if c.lower() in accepted_chars]
+
+    def ngram(self, n, l):
+        """ Return all n-grams from l after normalizing """
+        filtered = self.normalize(l)
+        for start in range(0, len(filtered) - n + 1):
+            yield ''.join(filtered[start:start + n])
+
+    def avg_transition_prob(self, l, log_prob_mat):
+        """ Return the average transition prob from l through log_prob_mat. """
+        log_prob = 0.0
+        transition_ct = 0
+        for a, b in self.ngram(2, l):
+            log_prob += log_prob_mat[pos[a]][pos[b]]
+            transition_ct += 1
+        # The exponentiation translates from log probs to probs.
+        return math.exp(log_prob / (transition_ct or 1))
+
+    def train(self, bigfile=big_file_path, goodfile=good_file_path,
+              badfile=bad_file_path):
+        """ Write a simple model as a pickle file """
+        k = len(accepted_chars)
+        # Assume we have seen 10 of each character pair. This acts as a kind of
+        # prior or smoothing factor. This way, if we see a character transition
+        # live that we've never observed in the past, we won't assume the entire
+        # string has 0 probability.
+        counts = [[10 for i in range(k)] for i in range(k)]
+
+        # Count transitions from big text file, taken
+        # from http://norvig.com/spell-correct.html
+        for line in open(bigfile, encoding='utf-8'):
+            for a, b in self.ngram(2, line):
+                counts[pos[a]][pos[b]] += 1
+
+        # Normalize the counts so that they become log probabilities.
+        # We use log probabilities rather than straight probabilities to avoid
+        # numeric underflow issues with long texts.
+        # This contains a justification:
+        # http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
+        for i, row in enumerate(counts):
+            s = float(sum(row))
+            for j in range(len(row)):
+                row[j] = math.log(row[j] / s)
+
+        # Find the probability of generating a few arbitrarily chosen good and
+        # bad phrases.
+        good_probs = [self.avg_transition_prob(l, counts) for l in open(goodfile, encoding='utf-8')]
+        bad_probs = [self.avg_transition_prob(l, counts) for l in open(badfile, encoding='utf-8')]
+
+        # Assert that we actually are capable of detecting the junk.
+        assert min(good_probs) > max(bad_probs)
+
+        # And pick a threshold halfway between the worst good and best bad inputs.
+        thresh = (min(good_probs) + max(bad_probs)) / 2
+        self.mat = counts
+        self.thresh = thresh
+        self.persist_model()
+
+    def detect_gibberish(self, text):
+        COPYRIGHT_INDICATORS = (
+            'copyright', '(c)', 'c)', '©', '@copyright',
+            'author:', 'commit', 'portions:', 'rights reserved',
+            '(p)', 'trademark', 'intellectual property'
+        )
+
+        text_lower = text.lower()
+        if any(indicator in text_lower for indicator in COPYRIGHT_INDICATORS):
+            return False
+
+        text_normalized = ''.join(self.normalize(text))
+        return self.avg_transition_prob(text_normalized, self.mat) < self.thresh
+
+    def percent_gibberish(self, text):
+        text = ''.join(self.normalize(text))
+        text = text.strip()
+        words = text.split()
+        if len(words) == 0:
+            return 0
+
+        gibberish_count = 0
+        for word in words:
+            if self.detect_gibberish(word):
+                gibberish_count += 1
+
+        return float(gibberish_count) / float(len(words))
+
+    def gibberish_pct(self, text):
+        text = ''.join(self.normalize(text))
+        return self.avg_transition_prob(text, self.mat)
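
For reviewers, a minimal usage sketch of the new module (not part of this diff). It assumes the package is importable as textcode and that the data/gibberish files (gib_model.pki, or big.txt/good.txt/bad.txt for first-time training) are present; the sample strings and their expected results are illustrative, not tested output:

    from textcode.gibberish import Gibberish

    # First instantiation trains from big.txt/good.txt/bad.txt and pickles the
    # model to gib_model.pki; later instantiations load the pickled model.
    detector = Gibberish()

    # True when the bigram transition probability falls below the threshold;
    # strings containing copyright-style indicators always return False.
    detector.detect_gibberish('zxcvwerjasc nmnjcviburili')    # typically True
    detector.detect_gibberish('Copyright (c) 2017 Example')   # False (indicator)

    # Fraction of whitespace-separated words classified as gibberish, 0.0-1.0.
    detector.percent_gibberish('qwkenb some ordinary words')

Note that percent_gibberish applies detect_gibberish to each word separately, so very short words carry few bigrams and may classify noisily.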