"""
ngrams.py

Train an n-gram model over a text file and greedily generate text from it.

Adapted from code by Paul Ebreo
Last changed 2025-10-01 by Amir Zeldes
"""

import argparse
from random import choice

from nltk import word_tokenize


def get_counts(context_length, training_text):
    """
    Count the frequencies of all continuations of each context tuple.

    :param context_length: Integer, number of tokens preceding current token
                           (use 2 for trigrams)
    :param training_text: The training data as one big string
    :return: counts: A dictionary mapping context tuples to dictionaries of
             continuation token counts
    """
    counts = {}
    tokens = word_tokenize(training_text)
    # Slide a window of context_length tokens over the corpus; the token
    # immediately after each window is its observed continuation.
    for i in range(len(tokens) - context_length):
        context = tuple(tokens[i:i + context_length])
        next_token = tokens[i + context_length]
        # Create the inner dict on first sight, then bump the count
        continuations = counts.setdefault(context, {})
        continuations[next_token] = continuations.get(next_token, 0) + 1
    return counts


def generate_from_file(context_length, training_file, output_length=60):
    """
    Train on a text file and print up to output_length greedily chosen tokens.

    Starts from a random context, then repeatedly appends the most frequent
    continuation of the current context. Stops early if the current context
    was never seen in training (i.e. it has no known continuation).

    :param context_length: Integer, number of tokens per context window
    :param training_file: Path to the training text file
    :param output_length: Maximum number of tokens to generate (default 60)
    :return: None; the generated text is printed to stdout
    """
    # Explicit encoding so behavior does not depend on the platform default
    with open(training_file, 'r', encoding='utf-8') as f:
        training_data = f.read()

    counts = get_counts(context_length, training_data)

    current_context = choice(list(counts.keys()))  # Random starting context
    output_list = list(current_context)

    for _ in range(output_length):
        if current_context not in counts:
            # The shifted context never occurred in training (e.g. it was the
            # very last window of the corpus) — no continuation is known.
            break
        continuations = counts[current_context]
        # Greedy choice: always take the most frequent continuation
        next_token = max(continuations, key=continuations.get)
        output_list.append(next_token)
        # Shift the window: drop the oldest token, append the new one
        current_context = current_context[1:] + (next_token,)

    print(" ".join(output_list))


if __name__ == "__main__":
    # Guard so importing this module does not parse argv or run generation
    parser = argparse.ArgumentParser()
    parser.add_argument("file")
    options = parser.parse_args()
    generate_from_file(2, options.file)