barkov

package module
v1.0.3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 25, 2025 License: MIT Imports: 6 Imported by: 0

README

Barkov

A simple markov chain generator.

Heavily inspired by https://github.com/jsvine/markovify.

Why?

I made this library because markovify was quite slow and did not give me enough control over the tokenization or validation parts of the Markov chain unless I overrode its existing classes, which I found very annoying. For this reason, this implementation is quite barebones and does not come with tokenization or validation code: you can tokenize your text however you want and validate a sentence in whichever way you see fit. If you don't want to use the Chain struct that I've defined and would rather use your own, that's fine: satisfy the GenerativeChain interface (two getters and one function that outputs the next token) and you can still use the most useful parts of this library.

Some advantages of this library over the original (in no particular order):

  • Much more memory efficient by default (doesn't store too many state variables, relies more on barebones maps and slices)
  • You don't need to override the default tokenizer, as there is no default tokenizer
  • You don't need to override the default validator, as there is no default validator
  • Implements tree pruning during markov generation, to allow for way more efficient generation
  • Uses goroutines to perform many generations at a time to allow for faster generation
  • Implements a timeout for generation functions that perform validation, allowing for bounded-time generation
  • All the useful functions are not written with some chain class in mind, but an interface, allowing for much more customizability

Features that aren't in this library (yet):

  • Combining models
  • Exporting and importing models to/from JSON

Usage

This is an exhaustive example of all features of this library.

package main

import (
	"fmt"
	"os"
	"strings"
	"time"

	"github.com/soumitradev/barkov"
)

const STATE_SIZE = 4
const MAX_SENTENCE_LEN = 100
const TIMEOUT = 10 * time.Second

// makeValidator builds the validation callback used during generation.
//
// The returned function joins the tokens of gram with single spaces and
// reports true only when the joined text does not occur anywhere in
// fullText, so output copied verbatim from the source text is rejected.
func makeValidator(fullText string) func([]string) bool {
	return func(gram []string) bool {
		candidate := strings.Join(gram, " ")
		if strings.Contains(fullText, candidate) {
			// Already present in the source text: reject it.
			return false
		}
		return true
	}
}

// tokenize converts raw messages into a corpus of token slices and builds
// the validator that goes with it.
//
// Each message is split on single spaces and empty tokens (produced by
// consecutive, leading or trailing spaces) are dropped. Messages whose
// remaining token count is below STATE_SIZE or above MAX_SENTENCE_LEN are
// skipped. The length check deliberately runs after the empty tokens are
// removed, so runs of spaces can neither make a short message look long
// enough nor make an acceptable message look too long.
//
// The second return value is the result of makeValidator over the kept
// messages, each re-joined with single spaces and terminated by a newline.
func tokenize(messages []string) ([][]string, func([]string) bool) {
	corpus := make([][]string, 0, len(messages))
	var fullText strings.Builder

	for _, message := range messages {
		parts := strings.Split(message, " ")

		// Filter out empty tokens that might exist due to multiple spaces
		filtered := make([]string, 0, len(parts))
		for _, token := range parts {
			if token != "" {
				filtered = append(filtered, token)
			}
		}

		// Filter out messages that are too long or too short, counting
		// only the real tokens.
		if len(filtered) < STATE_SIZE || len(filtered) > MAX_SENTENCE_LEN {
			continue
		}

		corpus = append(corpus, filtered)
		fullText.WriteString(strings.Join(filtered, " "))
		fullText.WriteByte('\n')
	}

	return corpus, makeValidator(fullText.String())
}

// readLines reads the whole file at filepath and returns its contents split
// into lines.
//
// Windows-style "\r\n" line endings are normalised to "\n" before splitting
// so that no line carries a trailing carriage return. A file that ends with
// a newline yields a final empty string.
//
// It panics if the file cannot be read; the panic message keeps the original
// "Error reading file at <path>" prefix and appends the underlying error so
// the cause is not lost.
func readLines(filepath string) []string {
	bytes, err := os.ReadFile(filepath)
	if err != nil {
		panic(fmt.Sprintf("Error reading file at %s: %v", filepath, err))
	}

	text := strings.ReplaceAll(string(bytes), "\r\n", "\n")
	return strings.Split(text, "\n")
}

func main() {
	filepath := "./corpus.txt"
	messages := readLines(filepath)
	corpus, validator := tokenize(messages)

	fmt.Println("Finished building corpus and context!")
	fmt.Printf("State Size: %d\n", STATE_SIZE)
	chain := barkov.InitChain(STATE_SIZE).Build(corpus).Compress()
	fmt.Println("Finished building and compiling markov model!")

	fmt.Println("Printing 5 random sentences first:")
	for range 5 {
		// Use the threaded version of the generation function with validator and timeout
		generated, err := barkov.GenThreaded(chain, validator, TIMEOUT)
		if err != nil {
			fmt.Println("[ERROR]", err)
			continue
		}
		fmt.Println(strings.Join(generated, " "))
	}

	fmt.Println("Printing 5 random sentences with start states:")
	for range 5 {
		start := barkov.ConstructState([]string{"i", "did", "not"})
		// You can even provide a start state
		generated, err := barkov.GenThreadedWithStart(chain, start, validator, TIMEOUT)
		if err != nil {
			fmt.Println("[ERROR]", err)
			continue
		}
		fmt.Println(strings.Join(generated, " "))
	}
}

Documentation

Index

Constants

View Source
const BEGIN = "</BEGIN/>"
View Source
const END = "</END/>"
View Source
const ErrGenerationTimeout = errorCause("sentence generation timed out")
View Source
const ErrSentenceFailedValidation = errorCause("sentence failed validation")
View Source
const ErrSentenceTooShort = errorCause("generated sentence too short")
View Source
const ErrStateNotFound = errorCause("state does not exist in model")
View Source
const SEP = "</SEP/>"

Variables

This section is empty.

Functions

func DeconstructState added in v1.0.1

func DeconstructState(state State) []string

func Gen added in v1.0.1

func Gen(chain GenerativeChain) ([]string, error)

func GenPruned added in v1.0.1

func GenPruned(
	chain GenerativeChain,
	validGram func([]string) bool,
) ([]string, error)

func GenPrunedWithStart added in v1.0.1

func GenPrunedWithStart(
	chain GenerativeChain,
	start State,
	validGram func([]string) bool,
) ([]string, error)

func GenThreaded added in v1.0.1

func GenThreaded(
	chain GenerativeChain,
	validGram func([]string) bool,
	timeout time.Duration,
) ([]string, error)

func GenThreadedWithStart added in v1.0.1

func GenThreadedWithStart(
	chain GenerativeChain,
	start State,
	validGram func([]string) bool,
	timeout time.Duration,
) ([]string, error)

func GenWithStart added in v1.0.1

func GenWithStart(chain GenerativeChain, start State) ([]string, error)

Types

type Chain added in v1.0.1

type Chain struct {
	Model Model
	// contains filtered or unexported fields
}

func InitChain added in v1.0.1

func InitChain(contextSize int) *Chain

func (*Chain) Build added in v1.0.1

func (chain *Chain) Build(corpus [][]string) *Chain

func (*Chain) Compress added in v1.0.1

func (chain *Chain) Compress() *CompressedChain

type CompressedChain added in v1.0.1

type CompressedChain struct {
	Model CompressedModel
	// contains filtered or unexported fields
}

type CompressedChoices added in v1.0.1

type CompressedChoices struct {
	CumDist []int
	Choices []string
}

type CompressedModel added in v1.0.1

type CompressedModel = map[State]CompressedChoices

type GenerativeChain added in v1.0.1

type GenerativeChain interface {
	// contains filtered or unexported methods
}

type Model added in v1.0.1

type Model = map[State]map[string]int

type Result added in v1.0.1

type Result struct {
	// contains filtered or unexported fields
}

type State added in v1.0.1

type State = string

func ConstructState added in v1.0.1

func ConstructState(state []string) State

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL