__init__.py 1.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
# coding: utf-8


import MeCab
import jaconv
from random import random

# Size of the symbol vocabulary. text_to_sequence() emits ord(c) for every
# character, so this presumably spans the 16-bit code point range
# -- TODO confirm against the consumer of this value.
n_vocab = 0xffff

# Symbol id appended to the end of every sequence by text_to_sequence().
_eos = 1
# Presumably the padding symbol id; not referenced in the visible part of
# this file -- verify against callers.
_pad = 0
# Shared MeCab tagger, created on first use by mix_pronunciation().
_tagger = None


def _yomi(mecab_result):
    """Split raw MeCab output into surface forms and their readings.

    Each line is expected to look like ``surface<TAB>feature,feature,...``.
    Parsing stops at the first line that contains no tab (e.g. the trailing
    ``EOS`` marker). The final newline-separated chunk is always ignored.

    Args:
        mecab_result: Text returned by the tagger's ``parse`` call.

    Returns:
        A ``(tokens, yomis)`` tuple of equally long lists. ``yomis[i]`` is the
        eighth comma-separated feature of token ``i`` (presumably the reading
        column of the dictionary format), or ``None`` when that column is
        missing or is the placeholder ``"*"``.

    Raises:
        ValueError: If a line contains more than one tab character.
    """
    surfaces, readings = [], []
    for row in mecab_result.split("\n")[:-1]:
        columns = row.split("\t")
        if len(columns) == 1:
            # No tab on this line: end of the token listing.
            break
        surface, feature_csv = columns
        features = feature_csv.split(",")

        reading = None
        if len(features) > 7 and features[7] != "*":
            reading = features[7]

        surfaces.append(surface)
        readings.append(reading)

    return surfaces, readings


def _mix_pronunciation(tokens, yomis, p):
    """Join tokens, randomly swapping some of them for their readings.

    Args:
        tokens: Surface strings, in order.
        yomis: Readings aligned with ``tokens``; ``None`` marks a token that
            has no reading and is therefore always kept as-is.
        p: Threshold compared against ``random()``; a token with a reading
            is replaced by it when the drawn value is below ``p``.

    Returns:
        The concatenation of the chosen strings.
    """
    pieces = []
    for idx, surface in enumerate(tokens):
        reading = yomis[idx]
        # random() is only drawn for tokens that actually have a reading.
        use_reading = reading is not None and random() < p
        pieces.append(reading if use_reading else surface)
    return "".join(pieces)


def mix_pronunciation(text, p):
    """Tokenize ``text`` with MeCab and randomly replace tokens by readings.

    The module-level tagger is created on the first call and reused
    afterwards.

    Args:
        text: Input string handed to the MeCab tagger.
        p: Threshold forwarded to ``_mix_pronunciation``; tokens that have a
            reading are replaced by it when ``random() < p``.

    Returns:
        The re-joined string with some tokens replaced by their readings.
    """
    global _tagger
    if _tagger is None:
        # Lazily build the tagger so importing this module stays cheap.
        _tagger = MeCab.Tagger("")

    parsed = _tagger.parse(text)
    surfaces, readings = _yomi(parsed)
    return _mix_pronunciation(surfaces, readings, p)


def add_punctuation(text):
    """Return ``text`` guaranteed to end with a punctuation mark.

    If ``text`` already ends with one of the recognised marks it is returned
    unchanged; otherwise the Japanese full stop "。" is appended. An empty
    string has no final character, so it yields just "。" instead of raising
    ``IndexError``.

    Args:
        text: Input string (may be empty).

    Returns:
        ``text`` itself, or ``text + "。"``.
    """
    # Full-width "!" and "?" are written as escapes so that they stay
    # distinct from their ASCII counterparts regardless of editor/encoding
    # normalisation.
    terminators = (".", ",", "、", "。", "!", "?", "\uff01", "\uff1f")
    if text.endswith(terminators):
        return text
    return text + "。"


# Maps ASCII and full-width commas/periods to their Japanese equivalents.
# The full-width forms are written as escapes so they cannot silently
# collapse into the ASCII ones.
_DELIMITER_TABLE = str.maketrans({
    ",": "、",
    "\uff0c": "、",
    ".": "。",
    "\uff0e": "。",
})


def normalize_delimitor(text):
    """Replace Western commas and periods with "、" and "。".

    Both the ASCII characters ("," and ".") and their full-width variants
    (U+FF0C and U+FF0E) are converted, in a single pass over the string.

    Args:
        text: Input string.

    Returns:
        A new string with the delimiters normalised.
    """
    return text.translate(_DELIMITER_TABLE)


def text_to_sequence(text, p=0.0):
    """Convert ``text`` into a list of integer symbol ids.

    Steps, in order:
      1. Remove spaces, brackets and the middle dot; map full-width "!" and
         "?" to their ASCII forms.
      2. Normalise commas/periods via ``normalize_delimitor`` and apply
         ``jaconv.normalize``.
      3. When ``p > 0``, randomly replace tokens by their readings via
         ``mix_pronunciation``.
      4. Convert hiragana to katakana and make sure the text ends with
         punctuation via ``add_punctuation``.
      5. Encode every character as ``ord(c)`` and append the EOS id.

    Args:
        text: Input string.
        p: Threshold forwarded to ``mix_pronunciation``; the pronunciation
            mixing step is skipped entirely when ``p`` is not positive.

    Returns:
        A list of ints: one code point per character, followed by ``_eos``.
    """
    # One translation pass: the third argument lists the characters to drop.
    # Full-width forms (ideographic space U+3000, parentheses U+FF08/U+FF09,
    # "!" U+FF01, "?" U+FF1F) are written as escapes so they remain distinct
    # from the ASCII characters next to them.
    cleanup = str.maketrans(
        "\uff01\uff1f", "!?",
        " \u3000「」『』・【】()\uff08\uff09")
    text = text.translate(cleanup)

    text = normalize_delimitor(text)
    text = jaconv.normalize(text)
    if p > 0:
        text = mix_pronunciation(text, p)
    text = jaconv.hira2kata(text)
    text = add_punctuation(text)

    return [ord(c) for c in text] + [_eos]  # EOS


def sequence_to_text(seq):
    """Turn a sequence of integer code points back into a string.

    Every id is converted with ``chr``; nothing is filtered out, so any
    special ids present in ``seq`` appear as their corresponding characters.

    Args:
        seq: Iterable of ints.

    Returns:
        The concatenated characters.
    """
    characters = map(chr, seq)
    return "".join(characters)