#!/usr/bin/env python
# -*- coding: utf8 -*-
#
# VCV v0.1a, placed in the public domain
#
# $Id: VCV.py 286 2006-08-07 18:26:26Z taliesin $
#
# TODO:
# - maybe remove words that are a single vcv, like #if# and #else#
# - user-friendliness
#

import sys
import re
import random


class VCV(object):
    """Learn vowel-consonant-vowel (VCV) patterns from text and make up words.

    Every input word is wrapped in '#' markers ('#word#') so that the start
    and the end of a word behave like vowel groups.  A pattern is a triple
    (V, C, V): a run of vowels/'#', a run of other characters, and another
    run of vowels/'#'.  Patterns are stored in ``self.patterns`` as
    ``{V: set((C, V), ...)}``.

    The code runs on both Python 2 and Python 3; byte strings are decoded
    with the configured encoding, text strings are used as they are.
    """

    # Runs of whitespace, non-word characters, digits and underscores.  Each
    # run is replaced by a single space before a line is split into words.
    punct = re.compile(r'[\s\W\d_]+', re.U)

    def __init__(self, vowels='aeiouy', encoding='utf8'):
        """Set up the vowel set and the patterns derived from it.

        vowels: the characters treated as vowels (byte string or text).
        encoding: used to decode byte-string vowels and, on Python 2, to
            encode the printed output.
        """
        v = self._to_text(vowels, encoding)
        self.vowels = v
        self.vowelpattern = re.compile(u'[%s]' % v, re.U)
        self.vcvpattern = re.compile(
            u'([#%s]+)([^#%s]+)([#%s]+)' % (v, v, v), re.U)
        self.encoding = encoding
        # Defined up front so generate()/showPatterns() work (as no-ops)
        # even before get_from_file() has been called.
        self.patterns = {}

    @staticmethod
    def _to_text(value, encoding):
        """Return value as text, decoding it first if it is a byte string."""
        if isinstance(value, bytes):
            return value.decode(encoding)
        return value

    def _emit(self, text):
        """Write text followed by a newline to standard output."""
        line = text + u'\n'
        if sys.version_info[0] < 3:
            # Python 2 streams take bytes, so encode explicitly.  On
            # Python 3 the text is written as is and the stream's own
            # encoding applies.
            line = line.encode(self.encoding)
        sys.stdout.write(line)

    def _find(self, word):
        """Return the set of (V, C, V) triples found in word.

        The result is always a set; it is empty when the word contains no
        vowel or no pattern at all.
        """
        if not self.vowelpattern.search(word):
            return set()
        matches = list(self.vcvpattern.finditer(word))
        if not matches:
            return set()
        found = set(m.groups() for m in matches)
        # Matches do not overlap, so a vowel group that closes one match
        # cannot open the next.  Scanning again from just behind the first
        # vowel group picks up the patterns skipped by the first pass.
        rest = word[matches[0].end(1):]
        found.update(self.vcvpattern.findall(rest))
        return found

    def get_from_file(self, File, encoding='utf8'):
        """Collect the VCV patterns of every word in File.

        File: an iterable of lines (byte strings or text).
        encoding: used to decode lines that are byte strings.

        Stores the result in ``self.patterns`` and returns it.
        """
        words = {}
        for raw in File:
            # Decode before lower-casing so non-ASCII capitals are folded.
            line = self._to_text(raw, encoding).strip().lower()
            if not line:
                continue
            for word in self.punct.sub(u' ', line).split(u' '):
                if len(word) < 2:
                    continue
                # The '#' markers delimit the word; generate() relies on
                # them to know where a word may start and where it ends.
                for vcv in self._find(u'#%s#' % word):
                    words.setdefault(vcv[0], set()).add(vcv[1:])
        self.patterns = words
        return words

    def showPatterns(self):
        """Print each leading vowel group followed by its full patterns."""
        for start in sorted(self.patterns):
            self._emit(start)
            for tail in sorted(self.patterns[start]):
                self._emit(u'\t%s%s' % (start, u''.join(tail)))

    def makeWord(self, vcvs, word, maxlen=3):
        """Extend word in place with up to maxlen random (C, V) pairs.

        vcvs: {'V*': (('K*', 'V*'), ..)}
        word: ['V*', 'K*', 'V*', ..]

        Stops early when the word ends in '#' (end of word reached) or when
        the last vowel group has no continuation, in which case '$' is
        appended.  Returns the same list object.
        """
        for _ in range(maxlen):
            if not word:
                break
            lastv = word[-1]
            if lastv not in vcvs:
                word.append(u'$')
                break
            word.extend(random.choice(vcvs[lastv]))
            if word[-1].endswith(u'#'):
                break
        return word

    def generate(self, num=5, maxlen=4):
        """Print num random words built from the collected patterns.

        maxlen bounds the number of (C, V) pairs per word.  Nothing is
        printed when no word-initial pattern is available.
        """
        vcvs = self.patterns
        # Sorted so that the output is reproducible for a given random seed.
        startV = sorted((k, tuple(sorted(v))) for k, v in vcvs.items()
                        if k.startswith(u'#'))
        otherV = dict((k, tuple(sorted(v))) for k, v in vcvs.items()
                      if not k.startswith(u'#'))
        if not startV:
            return
        for _ in range(num):
            start, tails = random.choice(startV)
            word = [start]
            word.extend(random.choice(tails))
            if word[-1].endswith(u'#'):
                self._emit(u''.join(word).strip(u'#'))
                continue
            # NOTE(review): longer words are printed with their '#' and '$'
            # markers while the short case above strips '#'; kept as it was,
            # confirm whether the markers are meant to be shown.
            self._emit(u''.join(self.makeWord(otherV, word, maxlen - 1)))

if __name__ == '__main__':
    # Script entry point: learn the patterns of the text on standard input
    # (vowels are the defaults plus ae-ligature, o-slash and a-ring), then
    # print generated words.
    generator = VCV(vowels='aeiouyæøå')
    generator.get_from_file(sys.stdin, encoding='utf8')
    generator.generate(num=10, maxlen=6)
