#!/usr/bin/python

import os
import sys
import codecs
import operator

try:
    from unidecode import unidecode
except ImportError:
    # Deferred on purpose: normalize() reports the missing package when it is
    # first needed, so the usage text can still be printed without it.
    unidecode = None

# NOTE(review): these markup literals were empty strings in the version under
# review (the tags appear to have been stripped from the source text). They are
# reconstructed from how parse_wiki_tokens uses them -- confirm against one of
# the downloaded wiktionary pages.
_CELL_OPEN = '<td>'
_CELL_CLOSE = '</td>'
_EMPTY_CELL = '<td></td>'
_ANCHOR_CLOSE = '</a>'

# Number of rows a single page may contain: early pages have 1k entries,
# later ones 2k, and the last one 1284.
_EXPECTED_ROW_COUNTS = (1000, 2000, 1284)

# Possessive ("...'s") tokens ranked beyond this value are discarded.
_POSSESSIVE_RANK_CUTOFF = 1000


def usage():
    '''Return the command-line help text, with the script name filled in.'''
    return '''
This script extracts words and counts from a 2006 wiktionary word frequency study over American
television and movies. To use, first visit the study and download, as .html files, all 26 of the
frequency lists:

https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#TV_and_movie_scripts

Put those into a single directory and point it to this script:

%s wiktionary_html_dir ../data/us_tv_and_film.txt

output.txt will include one line per word in the study, ordered by rank, of the form:

word1 count1
word2 count2
...
    ''' % sys.argv[0]


def parse_wiki_tokens(html_doc_str):
    '''Extract (rank, token, count) tuples from one frequency-list page.

    fragile hax, but checks the result at the end.

    The document is scanned line by line with a sliding window of three
    stripped lines. A window in which every line is a non-empty table cell is
    taken as one table row holding rank, token and count, in that order. The
    first such window is treated as the table header and skipped.

    Args:
        html_doc_str: full text of one downloaded page.

    Returns:
        A list of (rank, token, count) tuples where rank and count are ints
        and token has been passed through normalize().

    Raises:
        AssertionError: if kept plus skipped rows do not add up to one of the
            expected page sizes, i.e. the page did not parse as expected.
        ValueError: if a rank or count cell is not an integer, or the token
            cell contains no '>' character.
    '''
    results = []
    window = ['', '', '']
    header = True
    skipped = 0
    for line in html_doc_str.split('\n'):
        window.pop(0)
        window.append(line.strip())
        is_row = all(
            cell.startswith(_CELL_OPEN) and cell != _EMPTY_CELL
            for cell in window
        )
        if not is_row:
            continue
        if header:
            header = False
            continue
        # Replacing the window with the tag-free text also keeps the next two
        # windows from matching again on the cells of this same row.
        window = [
            cell.replace(_CELL_OPEN, '').replace(_CELL_CLOSE, '').strip()
            for cell in window
        ]
        rank, token, count = window
        rank = int(rank.split()[0])
        # The token cell still wraps the word in a link: drop the closing
        # anchor, then everything up to the end of the opening one.
        token = token.replace(_ANCHOR_CLOSE, '')
        token = token[token.index('>') + 1:]
        token = normalize(token)
        # wiktionary has thousands of words that end in 's
        # keep the common ones (rank under 1000), discard the rest
        #
        # otherwise end up with a bunch of duplicates eg victor / victor's
        if token.endswith("'s") and rank > _POSSESSIVE_RANK_CUTOFF:
            skipped += 1
            continue
        results.append((rank, token, int(count)))
    total = len(results) + skipped
    # Raised explicitly (rather than via `assert`) so the sanity check is not
    # stripped when Python runs with -O.
    if total not in _EXPECTED_ROW_COUNTS:
        raise AssertionError(
            'parsed %d rows, expected one of %s'
            % (total, list(_EXPECTED_ROW_COUNTS)))
    return results


def normalize(token):
    '''Return `token` run through unidecode and lowercased.

    Raises:
        ImportError: if the third-party `unidecode` package is not installed.
    '''
    if unidecode is None:
        raise ImportError(
            'the "unidecode" package is required (pip install unidecode)')
    return unidecode(token).lower()


def main(wiktionary_html_root, output_filename):
    '''Parse every page in `wiktionary_html_root` and write the merged list.

    Each regular file in the directory is read as UTF-8 and parsed with
    parse_wiki_tokens(). The combined entries are sorted by rank and written
    to `output_filename` as UTF-8, one "token count" line per entry.
    '''
    rank_token_count = []  # list of 3-tuples
    for filename in sorted(os.listdir(wiktionary_html_root)):
        path = os.path.join(wiktionary_html_root, filename)
        # Browsers may save companion "<page>_files" directories next to the
        # .html files; those cannot be opened as documents.
        if not os.path.isfile(path):
            continue
        with codecs.open(path, 'r', 'utf8') as f:
            rank_token_count.extend(parse_wiki_tokens(f.read()))
    rank_token_count.sort(key=operator.itemgetter(0))
    with codecs.open(output_filename, 'w', 'utf8') as f:
        for rank, token, count in rank_token_count:
            f.write('%-18s %d\n' % (token, count))


if __name__ == '__main__':
    if len(sys.argv) != 3:
        # Parenthesised form works as a statement on Python 2 and as a
        # function call on Python 3.
        print(usage())
    else:
        main(*sys.argv[1:])
    sys.exit(0)