pierd.github.io


Project maintained by pierd Hosted on GitHub Pages — Theme by mattgraham
1 December 2019

Trainspeaking

“Trainspotting” by Irvine Welsh is partially written in Scots. That makes it so hard to read that I decided to build an n-gram model and generate text based on it. Just a quick experiment, you can find it here.

The model was built using something like this:

#!/usr/bin/env python3

import re
import json

SIZE = 5

def flatten(dct):
    """Turn a dict keyed by sequences into a nested dict (a trie).

    Every element of a key except the last becomes one level of nesting;
    the last element becomes the key under which the value is stored.
    For example ``{('a', 'b'): 1}`` becomes ``{'a': {'b': 1}}``.
    """
    tree = {}
    for path, leaf in dct.items():
        node = tree
        # Descend through the intermediate levels, creating them on demand.
        for part in path[:-1]:
            node = node.setdefault(part, {})
        node[path[-1]] = leaf
    return tree

# Build the n-gram counts: each context of SIZE - 1 consecutive tokens maps to
# a {next_token: occurrences} dict, then print the nested (flattened) tree as JSON.
# The file is decoded explicitly as UTF-8 because the tokenizer below matches the
# typographic apostrophe (U+2019); a locale-dependent default encoding could
# mis-decode it.
with open('book.txt', 'rt', encoding='utf-8') as input_file:
    data = input_file.read()
    ngrams = {}
    # The context starts as a lone full stop and grows until it holds SIZE - 1 tokens.
    window = ('.',)
    # Tokens are runs of word characters/apostrophes, or a single '.', '?' or ','.
    for word in re.findall(r'[’\w]+|\.|\?|,', data):
        # Skip tokens longer than 10 characters and stray lone apostrophes.
        if len(word) > 10 or word == '’':
            continue
        word = word.lower()
        if len(window) == SIZE - 1:
            # Full context: count this continuation, then slide the window by one token.
            ngrams[window] = ngrams.get(window, {})
            ngrams[window][word] = ngrams[window].get(word, 0) + 1
            window = window[1:] + (word,)
        else:
            # Context still warming up: just extend it.
            window = window + (word,)

    print(json.dumps(flatten(ngrams)))

The generation itself is also very simple:

/**
 * Depth-first walk over a nested object whose leaves are integer counts.
 *
 * `fun` is called for every leaf with the current key path and the leaf value.
 * The walk stops as soon as `fun` returns anything other than `undefined`, and
 * that value is returned. In that case the path array is left holding the keys
 * that led to the leaf which stopped the walk.
 *
 * @param {Object} tree - nested object; integer values are leaves, everything else is recursed into.
 * @param {function(string[], number): *} fun - leaf callback; the path array is reused between calls.
 * @param {string[]} [current] - accumulated key path (used by the recursion; callers normally omit it).
 * @returns {*} the first non-undefined value returned by `fun`, or undefined if the walk completed.
 */
function iter_tree(tree, fun, current) {
    const path = current || [];
    for (const [key, value] of Object.entries(tree)) {
        path.push(key);
        // Declared locally: the original assigned to an undeclared `result`,
        // which leaks a global and throws in strict mode / ES modules.
        const result = Number.isInteger(value)
            ? fun(path, value)
            : iter_tree(value, fun, path);
        if (result !== undefined) {
            return result;
        }
        path.pop();
    }
    return undefined;
}

/**
 * Pick a random top-level key of `tree`, weighted by the sum of the integer
 * leaf counts found beneath each key.
 *
 * @param {Object} tree - nested object of counts, as walked by iter_tree.
 * @returns {string|undefined} the chosen top-level key, or undefined when the tree has no leaves.
 */
function random_word_from_tree(tree) {
    // All state is local: the original assigned to undeclared `sum`, `selected`
    // and `result`, which leaks globals and throws in strict mode / ES modules.
    let total = 0;
    iter_tree(tree, (path, count) => {
        total += count;
    });
    // Draw an index in [0, total) and find the leaf whose cumulative count covers it.
    let remaining = Math.floor(Math.random() * total);
    return iter_tree(tree, (path, count) => {
        remaining -= count;
        // path[0] is the first key below `tree`, i.e. the candidate next word.
        return remaining < 0 ? path[0] : undefined;
    });
}

/**
 * Sample the next word given the preceding words.
 *
 * Walks the global `ngrams` tree along `prefix` and draws a weighted random
 * word from the subtree reached.
 *
 * @param {string[]} prefix - preceding words, oldest first; each must be a key at its level of `ngrams`.
 * @returns {string|undefined} whatever random_word_from_tree returns for the reached subtree.
 */
function random_word(prefix) {
    // Local binding: the original assigned to undeclared `tree` and `result`,
    // which leaks globals and throws in strict mode / ES modules.
    const subtree = prefix.reduce((node, word) => node[word], ngrams);
    return random_word_from_tree(subtree);
}

/**
 * Generate text by repeatedly sampling the next word from the n-gram model.
 *
 * @param {number} size - n-gram order; each word is sampled from the preceding size - 1 words.
 * @param {number} length - generation continues until the token list (including the
 *     seed '.') is longer than this AND the last token is '.' or '?'.
 * @returns {string} the tokens joined by spaces, with leading '.'/'?' tokens dropped
 *     and the space before ',', '.' and '?' removed.
 */
function generate(size, length) {
    // Local binding: the original assigned to an undeclared `text`, which
    // leaks a global and throws in strict mode / ES modules.
    let words = ['.'];
    // Warm-up: grow the seed until it holds `size` tokens, using everything so far as context.
    while (words.length <= size - 1) {
        words.push(random_word(words));
    }
    // Main loop: the context is the last size - 1 tokens; stop only at a sentence end.
    while (words.length <= length || (words[words.length - 1] !== '.' && words[words.length - 1] !== '?')) {
        words.push(random_word(words.slice(-(size - 1))));
    }
    while (words[0] === '.' || words[0] === '?') {
        words = words.slice(1);   // drop the initial dots
    }
    // Remove the space that join(' ') puts before ',', '.' and '?'.
    return words.join(' ').replace(/ ([,.?])/g, '$1');
}
tags: trainspotting - ngrams