"""
ngrams.py

Train an n-gram model over a text file and greedily generate text from it.

Adapted from code by Paul Ebreo
Last changed 2025-10-01 by Amir Zeldes
"""

import argparse
from random import choice

from nltk import word_tokenize


def get_counts(context_length, training_text):
    """
    Count the frequencies of all continuations of each context tuple.

    :param context_length: Integer, number of tokens preceding current token
                           (use 2 for trigrams)
    :param training_text: The training data as one big string
    :return: counts: A dictionary mapping context tuples to dictionaries of
             continuation token counts
    """
    counts = {}
    tokens = word_tokenize(training_text)
    # Slide a window of context_length tokens over the corpus; the token
    # immediately after each window is its observed continuation.
    for i in range(len(tokens) - context_length):
        context = tuple(tokens[i:i + context_length])
        next_token = tokens[i + context_length]
        # Create the inner dict on first sight, then bump the count
        continuations = counts.setdefault(context, {})
        continuations[next_token] = continuations.get(next_token, 0) + 1
    return counts


def generate_from_file(context_length, training_file, output_length=60):
    """
    Train on a text file and print up to output_length greedily chosen tokens.

    Starts from a random context, then repeatedly appends the most frequent
    continuation of the current context. Stops early if the current context
    was never seen in training (i.e. it has no known continuation).

    :param context_length: Integer, number of tokens per context window
    :param training_file: Path to the training text file
    :param output_length: Maximum number of tokens to generate (default 60)
    :return: None; the generated text is printed to stdout
    """
    # Explicit encoding so behavior does not depend on the platform default
    with open(training_file, 'r', encoding='utf-8') as f:
        training_data = f.read()

    counts = get_counts(context_length, training_data)

    current_context = choice(list(counts.keys()))  # Random starting context
    output_list = list(current_context)

    for _ in range(output_length):
        if current_context not in counts:
            # The shifted context never occurred in training (e.g. it was the
            # very last window of the corpus) — no continuation is known.
            break
        continuations = counts[current_context]
        # Greedy choice: always take the most frequent continuation
        next_token = max(continuations, key=continuations.get)
        output_list.append(next_token)
        # Shift the window: drop the oldest token, append the new one
        current_context = current_context[1:] + (next_token,)

    print(" ".join(output_list))


if __name__ == "__main__":
    # Guard so importing this module does not parse argv or run generation
    parser = argparse.ArgumentParser()
    parser.add_argument("file")
    options = parser.parse_args()
    generate_from_file(2, options.file)