On GitHub: leesdolphin/language-punctuation
from collections import Counter
import os

# Root of the standard-library tree whose source files are analysed.
PYTHON_ROOT = '/usr/lib/python3.4/'

# Tally of every character found in every ``.py`` file under PYTHON_ROOT.
symbol_counter = Counter()
for dir_path, dir_names, files in os.walk(PYTHON_ROOT):
    for filename in files:
        if not filename.endswith('.py'):
            continue
        # Python 3 source defaults to UTF-8 (PEP 3120); naming the encoding
        # keeps the counts independent of the machine's locale settings.
        with open(os.path.join(dir_path, filename), 'r', encoding='utf-8') as f:
            # Counter.update() on a string counts each character separately.
            symbol_counter.update(f.read())
# Show the five most frequent characters tallied in symbol_counter.
display_counter(symbol_counter.most_common(5))
# Characters left out of the "interesting" ranking: ASCII letters, the space
# and the newline.
BORING_CHARACTERS = string.ascii_letters + ' \n'

# Work on a copy so the full tally in symbol_counter stays intact.
interesting_counter = symbol_counter.copy()
for character in BORING_CHARACTERS:
    interesting_counter[character] = 0

# Unary plus returns a new Counter without the zeroed entries; most_common()
# then orders what is left from most to least frequent.
interesting_chars = (+interesting_counter).most_common()

# Same data as a plain mapping for direct character -> count lookups.
interesting_chars_dict = dict(interesting_chars)

display_counter(interesting_chars[:7], fmt=str)
# Bar chart of the twelve most frequent entries of interesting_chars.
fig = plt.figure(figsize=(14, 6))

# Transpose the (character, count) pairs into one tuple of characters and
# one tuple of counts.
x_chars, y_counts = zip(*interesting_chars[:12])
indexes = np.arange(len(x_chars))

plt.bar(indexes, y_counts, width, color='#669DC7', figure=fig)
plt.title('Most common characters', fontsize=30)
plt.ylabel('Counts', fontsize=20)
# Ticks are shifted right by half a bar width (presumably to centre each
# label under its bar -- depends on Matplotlib's bar alignment).
plt.xticks(indexes + width / 2., x_chars, fontsize=50)
# Trailing semicolon kept from the original cell (presumably to suppress the
# notebook's echo of the return value).
plt.yticks(fontsize=25);
# Show the keyboard picture (Image is presumably IPython.display.Image -- confirm).
Image("keyboard_punctuation_keys.png")
# Digit frequencies (bars) compared with the Benford's-law prediction (line).
# plt.hold() is deliberately not called: overlaying plots on the same axes is
# Matplotlib's default behaviour, and the function no longer exists in
# Matplotlib 3.0 and later.
fig = plt.figure(figsize=(14, 5))
indexes = np.arange(10)
x_nums = list(map(str, indexes))
# A digit that never occurs has no entry in the dict, so count it as zero.
y_counts = [interesting_chars_dict.get(num, 0) for num in x_nums]
plt.bar(indexes, y_counts, width, color='#669DC7', figure=fig)

# Benford's law, P(d) = log10(1 + 1/d), is only defined for d = 1..9.
# Leaving out d = 0 avoids a divide-by-zero warning and an infinite point;
# the curve is scaled by the total count of the digits 1..9.
benford_digits = indexes[1:]
plt.plot(benford_digits + width / 2.,
         np.log10(1 + 1 / benford_digits) * sum(y_counts[1:]))

plt.ylabel('Counts', fontsize=20)
plt.title('Is this your number?', fontsize=30)
plt.xticks(indexes + width / 2., x_nums, fontsize=50)
plt.yticks(fontsize=25);
The line follows Benford's Law; the digit counts in the Python source only sort of do.
# Look up how many tab characters were counted.
interesting_chars_dict['\t']
4
# ASCII covers code points 0-127 inclusive, so a character is non-ASCII
# exactly when its code point is above 127 (DEL, 127, is still ASCII).
non_ascii = {char for char in interesting_chars_dict if ord(char) > 127}
# Bare expression: the notebook echoes the number of distinct characters.
len(non_ascii)
94
# Print every non-ASCII character separated by spaces; non_ascii is a set,
# so the order of the output is arbitrary.
print(*non_ascii)
ø µ Ø È ë Ł ß Ù π Ð ô á Ö φ Ú ε ϱ Û σ ϑ Î ï À ṡ Ä ſ Ì Ç ΰ ſt õ â Ï ϰ í î Ô ÿ ç ρ Õ Á ΰ κ æ ϵ ü Ü β ı ΐ Æ ι Ã Í ì ϕ Å ã ú Ò ϖ Ë Ó Ý å ς ý ι ê μ Â ñ à Ñ û Ê θ ö é ù Þ st è ð ó ò ΐ ẛ þ É ϐ ä
Thank you to
# Report every ``.py`` file under PYTHON_ROOT that contains a tab character,
# followed by each line of that file in which a tab appears.
for dir_path, dir_names, files in os.walk(PYTHON_ROOT):
    for filename in files:
        if not filename.endswith('.py'):
            continue
        # Build the path once; it is used both to open and to report the file.
        file_path = os.path.join(dir_path, filename)
        # Python 3 source defaults to UTF-8 (PEP 3120); naming the encoding
        # keeps the scan independent of the machine's locale settings.
        with open(file_path, 'r', encoding='utf-8') as f:
            file_content = f.read()
        if '\t' not in file_content:
            continue
        print(file_path)
        # Line numbers are zero-based, exactly as enumerate() produces them.
        for line_no, line in enumerate(file_content.splitlines()):
            if '\t' in line:
                print(line_no, ":", repr(line))
/usr/lib/python3.4/gettext.py 383 : '\t# first look into the standard locale dir, then into the ' 384 : '\t# langpack locale dir' 386 : '\t# standard mo file' 393 : '\t# langpack mofile -> use it'
# Log-scale bar chart built from a slice of all_chars.
fig = plt.figure(figsize=(4 * 8, 3 * 8))

# Skip the space (index 0) and the newline (index 9), then transpose the
# remaining (character, count) pairs into two tuples.
x_chars, y_counts = zip(*(all_chars[1:9] + all_chars[10:97]))
indexes = np.arange(len(x_chars))

plt.bar(indexes, y_counts, width, color='#669DC7', figure=fig)
plt.title('Most common characters', fontsize=30)
plt.ylabel('Counts', fontsize=20)
plt.xticks(indexes + width / 2., x_chars, fontsize=30)
plt.yticks(fontsize=25)
# Limit the x axis to the first and last bar positions.
plt.xlim(indexes.min(), indexes.max())
# Logarithmic y axis; non-positive values are clipped rather than masked.
plt.yscale('log', nonposy='clip')