Unverified commit 1aa7495d authored by 小湉湉, committed by GitHub

[TTS]Add license and reformat for TTSCppFrontend (#3030)

Parent 259f4936
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <chrono>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
@@ -10,24 +23,28 @@
using namespace paddle::lite_api;

class PredictorInterface {
  public:
    virtual ~PredictorInterface() = 0;
    virtual bool Init(const std::string &AcousticModelPath,
                      const std::string &VocoderPath,
                      PowerMode cpuPowerMode,
                      int cpuThreadNum,
                      // WAV sample rate (must match the model output)
                      // If the playback speed or pitch is wrong, adjust it
                      // Common rates: 16000, 24000, 32000, 44100, 48000, 96000
                      uint32_t wavSampleRate) = 0;
    virtual std::shared_ptr<PaddlePredictor> LoadModel(
        const std::string &modelPath,
        int cpuThreadNum,
        PowerMode cpuPowerMode) = 0;
    virtual void ReleaseModel() = 0;
    virtual bool RunModel(const std::vector<int64_t> &phones) = 0;
    virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(
        const std::vector<int64_t> &phones) = 0;
    virtual std::unique_ptr<const Tensor> GetVocoderOutput(
        std::unique_ptr<const Tensor> &&amOutput) = 0;
    virtual void VocoderOutputToWav(
        std::unique_ptr<const Tensor> &&vocOutput) = 0;
    virtual void SaveFloatWav(float *floatWav, int64_t size) = 0;
    virtual bool IsLoaded() = 0;
    virtual float GetInferenceTime() = 0;
@@ -45,23 +62,22 @@ PredictorInterface::~PredictorInterface() {}
// WavDataType: the WAV sample data type;
// switch between int16_t and float to generate
// 16-bit PCM or 32-bit IEEE float WAV output.
template <typename WavDataType>
class Predictor : public PredictorInterface {
  public:
    bool Init(const std::string &AcousticModelPath,
              const std::string &VocoderPath,
              PowerMode cpuPowerMode,
              int cpuThreadNum,
              // WAV sample rate (must match the model output)
              // If the playback speed or pitch is wrong, adjust it
              // Common rates: 16000, 24000, 32000, 44100, 48000, 96000
              uint32_t wavSampleRate) override {
        // Release model if exists
        ReleaseModel();

        acoustic_model_predictor_ =
            LoadModel(AcousticModelPath, cpuThreadNum, cpuPowerMode);
        if (acoustic_model_predictor_ == nullptr) {
            return false;
        }
@@ -80,7 +96,10 @@ public:
        ReleaseWav();
    }

    std::shared_ptr<PaddlePredictor> LoadModel(
        const std::string &modelPath,
        int cpuThreadNum,
        PowerMode cpuPowerMode) override {
        if (modelPath.empty()) {
            return nullptr;
        }
@@ -94,12 +113,12 @@ public:
        return CreatePaddlePredictor<MobileConfig>(config);
    }

    void ReleaseModel() override {
        acoustic_model_predictor_ = nullptr;
        vocoder_predictor_ = nullptr;
    }

    bool RunModel(const std::vector<int64_t> &phones) override {
        if (!IsLoaded()) {
            return false;
        }
@@ -115,12 +134,13 @@ public:
        // Compute the elapsed time
        std::chrono::duration<float> duration = end - start;
        inference_time_ = duration.count() * 1000;  // in milliseconds
        return true;
    }

    std::unique_ptr<const Tensor> GetAcousticModelOutput(
        const std::vector<int64_t> &phones) override {
        auto phones_handle = acoustic_model_predictor_->GetInput(0);
        phones_handle->Resize({static_cast<int64_t>(phones.size())});
        phones_handle->CopyFromCpu(phones.data());
@@ -139,7 +159,8 @@ public:
        return am_output_handle;
    }

    std::unique_ptr<const Tensor> GetVocoderOutput(
        std::unique_ptr<const Tensor> &&amOutput) override {
        auto mel_handle = vocoder_predictor_->GetInput(0);
        // [?, 80]
        auto dims = amOutput->shape();
@@ -161,7 +182,8 @@ public:
        return voc_output_handle;
    }

    void VocoderOutputToWav(
        std::unique_ptr<const Tensor> &&vocOutput) override {
        // Get the output tensor data
        int64_t output_size = 1;
        for (auto dim : vocOutput->shape()) {
@@ -172,39 +194,31 @@ public:
        SaveFloatWav(output_data, output_size);
    }
    void SaveFloatWav(float *floatWav, int64_t size) override;

    bool IsLoaded() override {
        return acoustic_model_predictor_ != nullptr &&
               vocoder_predictor_ != nullptr;
    }

    float GetInferenceTime() override { return inference_time_; }

    const std::vector<WavDataType> &GetWav() { return wav_; }

    int GetWavSize() override { return wav_.size() * sizeof(WavDataType); }

    // Get the WAV duration in milliseconds
    float GetWavDuration() override {
        return static_cast<float>(GetWavSize()) / sizeof(WavDataType) /
               static_cast<float>(wav_sample_rate_) * 1000;
    }

    // Get the RTF (synthesis time / audio duration)
    float GetRTF() override { return GetInferenceTime() / GetWavDuration(); }

    void ReleaseWav() override { wav_.clear(); }
    bool WriteWavToFile(const std::string &wavPath) override {
        std::ofstream fout(wavPath, std::ios::binary);
        if (!fout.is_open()) {
            return false;
@@ -216,18 +230,20 @@ public:
        header.data_size = GetWavSize();
        header.size = sizeof(header) - 8 + header.data_size;
        header.sample_rate = wav_sample_rate_;
        header.byte_rate = header.sample_rate * header.num_channels *
                           header.bits_per_sample / 8;
        header.block_align = header.num_channels * header.bits_per_sample / 8;
        fout.write(reinterpret_cast<const char *>(&header), sizeof(header));

        // Write the WAV samples
        fout.write(reinterpret_cast<const char *>(wav_.data()),
                   header.data_size);

        fout.close();
        return true;
    }
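    // Worked example of the header math in WriteWavToFile above, assuming a
    // mono stream (num_channels == 1) at 24000 Hz with 16-bit samples:
    //   byte_rate   = 24000 * 1 * 16 / 8 = 48000 bytes per second
    //   block_align = 1 * 16 / 8 = 2 bytes per sample frame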
  protected:
    struct WavHeader {
        // RIFF header
        char riff[4] = {'R', 'I', 'F', 'F'};
@@ -250,19 +266,17 @@ protected:
    };

    enum WavAudioFormat {
        WAV_FORMAT_16BIT_PCM = 1,   // 16-bit PCM format
        WAV_FORMAT_32BIT_FLOAT = 3  // 32-bit IEEE float format
    };

  protected:
    // The return value is determined by WavDataType via template specialization
    inline uint16_t GetWavAudioFormat();

    inline float Abs(float number) { return (number < 0) ? -number : number; }

  protected:
    float inference_time_ = 0;
    uint32_t wav_sample_rate_ = 0;
    std::vector<WavDataType> wav_;
@@ -270,36 +284,36 @@ protected:
    std::shared_ptr<PaddlePredictor> vocoder_predictor_ = nullptr;
};
template <>
uint16_t Predictor<int16_t>::GetWavAudioFormat() {
    return Predictor::WAV_FORMAT_16BIT_PCM;
}

template <>
uint16_t Predictor<float>::GetWavAudioFormat() {
    return Predictor::WAV_FORMAT_32BIT_FLOAT;
}
// Save the WAV as 16-bit PCM
template <>
void Predictor<int16_t>::SaveFloatWav(float *floatWav, int64_t size) {
    wav_.resize(size);
    float maxSample = 0.01;
    // Find the maximum sample magnitude
    for (int64_t i = 0; i < size; i++) {
        float sample = Abs(floatWav[i]);
        if (sample > maxSample) {
            maxSample = sample;
        }
    }
    // Scale the samples to the int16_t range
    for (int64_t i = 0; i < size; i++) {
        wav_[i] = floatWav[i] * 32767.0f / maxSample;
    }
}
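// Note on the scaling above: dividing by the peak magnitude peak-normalizes
// the clip before the cast to int16_t, so a waveform whose largest sample is
// 0.5 is stretched to the full [-32767, 32767] range; the 0.01 floor on
// maxSample keeps near-silent audio from being amplified all the way up to
// full scale.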
// Save the WAV as 32-bit IEEE float
template <>
void Predictor<float>::SaveFloatWav(float *floatWav, int64_t size) {
    wav_.resize(size);
    std::copy_n(floatWav, size, wav_.data());
......
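// A minimal usage sketch of the Predictor template above (paths, thread count
// and sample rate are illustrative placeholders; main.cc below is the real,
// flag-driven entry point):
//
//     Predictor<int16_t> predictor;  // 16-bit PCM output
//     std::vector<int64_t> phones = {/* phone ids from the text frontend */};
//     if (predictor.Init("path/to/acoustic_model.nb",
//                        "path/to/vocoder.nb",
//                        PowerMode::LITE_POWER_HIGH,
//                        /*cpuThreadNum=*/1,
//                        /*wavSampleRate=*/24000) &&
//         predictor.RunModel(phones)) {
//         predictor.WriteWavToFile("path/to/output.wav");
//     }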
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <front/front_interface.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <paddle_api.h>
#include <cstdlib>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include "Predictor.hpp"
using namespace paddle::lite_api;

DEFINE_string(
    sentence,
    "你好,欢迎使用语音合成服务",
    "Text to be synthesized (Chinese only. English will crash the program.)");
DEFINE_string(front_conf, "./front.conf", "Front configuration file");
DEFINE_string(acoustic_model,
              "./models/cpu/fastspeech2_csmsc_arm.nb",
              "Acoustic model .nb file");
DEFINE_string(vocoder,
              "./models/cpu/fastspeech2_csmsc_arm.nb",
              "vocoder .nb file");
DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file");
DEFINE_string(wav_bit_depth,
              "16",
              "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)");
DEFINE_string(wav_sample_rate,
              "24000",
              "WAV sample rate, should match the output of the vocoder");
DEFINE_string(cpu_thread, "1", "CPU thread numbers");

int main(int argc, char *argv[]) {
@@ -53,7 +78,7 @@ int main(int argc, char *argv[]) {
    // Convert traditional Chinese to simplified Chinese
    std::wstring sentence_simp;
    front_inst->Trand2Simp(ws_sentence, &sentence_simp);
    ws_sentence = sentence_simp;

    std::string s_sentence;
@@ -63,28 +88,30 @@ int main(int argc, char *argv[]) {
    // Split the sentence by punctuation
    LOG(INFO) << "Start to segment sentences by punctuation";
    front_inst->SplitByPunc(ws_sentence, &sentence_part);
    LOG(INFO) << "Segment sentences through punctuation successfully";

    // Get the phoneme and tone ids of each sub-sentence
    LOG(INFO)
        << "Start to get the phoneme and tone id sequence of each sentence";
    for (int i = 0; i < sentence_part.size(); i++) {
        LOG(INFO) << "Raw sentence is: "
                  << ppspeech::wstring2utf8string(sentence_part[i]);
        front_inst->SentenceNormalize(&sentence_part[i]);
        s_sentence = ppspeech::wstring2utf8string(sentence_part[i]);
        LOG(INFO) << "After normalization sentence is: " << s_sentence;

        if (0 != front_inst->GetSentenceIds(s_sentence, &phoneids, &toneids)) {
            LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
            return -1;
        }
    }
    LOG(INFO) << "The phoneids of the sentence is: "
              << limonp::Join(phoneids.begin(), phoneids.end(), " ");
    LOG(INFO) << "The toneids of the sentence is: "
              << limonp::Join(toneids.begin(), toneids.end(), " ");
    LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";

    /////////////////// Backend: phonemes to audio ///////////////////
@@ -99,13 +126,19 @@ int main(int argc, char *argv[]) {
    // CPU power mode
    const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH;

    if (!predictor->Init(FLAGS_acoustic_model,
                         FLAGS_vocoder,
                         cpuPowerMode,
                         cpuThreadNum,
                         wavSampleRate)) {
        LOG(ERROR) << "predictor init failed" << std::endl;
        return -1;
    }

    std::vector<int64_t> phones(phoneids.size());
    std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) {
        return static_cast<int64_t>(x);
    });

    if (!predictor->RunModel(phones)) {
        LOG(ERROR) << "predictor run model failed" << std::endl;
@@ -113,7 +146,8 @@ int main(int argc, char *argv[]) {
    }

    LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, "
              << "WAV size (without header): " << predictor->GetWavSize()
              << " bytes, "
              << "WAV duration: " << predictor->GetWavDuration() << " ms, "
              << "RTF: " << predictor->GetRTF() << std::endl;
......
@@ -38,6 +38,7 @@ If the download speed is too slow, you can open [third-party/CMakeLists.txt](thi
```
## Run
You can change `--phone2id_path` in `./front_demo/front.conf` to the `phone_id_map.txt` of your own acoustic model.
```
./run_front_demo.sh
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <map>
#include <string>
#include "front/front_interface.h"
DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");
// DEFINE_string(seperate_tone, "true", "If true, get phoneids and tonesid");

int main(int argc, char** argv) {
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    // Instantiate the text frontend engine
    ppspeech::FrontEngineInterface* front_inst = nullptr;
    front_inst = new ppspeech::FrontEngineInterface(FLAGS_front_conf);
    if ((!front_inst) || (front_inst->init())) {
        LOG(ERROR) << "Creater tts engine failed!";
@@ -28,7 +41,7 @@ int main(int argc, char** argv) {
    // Convert traditional Chinese to simplified Chinese
    std::wstring sentence_simp;
    front_inst->Trand2Simp(ws_sentence, &sentence_simp);
    ws_sentence = sentence_simp;

    std::string s_sentence;
@@ -38,28 +51,29 @@ int main(int argc, char** argv) {
    // Split the sentence by punctuation
    LOG(INFO) << "Start to segment sentences by punctuation";
    front_inst->SplitByPunc(ws_sentence, &sentence_part);
    LOG(INFO) << "Segment sentences through punctuation successfully";

    // Get the phoneme and tone ids of each sub-sentence
    LOG(INFO)
        << "Start to get the phoneme and tone id sequence of each sentence";
    for (int i = 0; i < sentence_part.size(); i++) {
        LOG(INFO) << "Raw sentence is: "
                  << ppspeech::wstring2utf8string(sentence_part[i]);
        front_inst->SentenceNormalize(&sentence_part[i]);
        s_sentence = ppspeech::wstring2utf8string(sentence_part[i]);
        LOG(INFO) << "After normalization sentence is: " << s_sentence;

        if (0 != front_inst->GetSentenceIds(s_sentence, &phoneids, &toneids)) {
            LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
            return -1;
        }
    }

    LOG(INFO) << "The phoneids of the sentence is: "
              << limonp::Join(phoneids.begin(), phoneids.end(), " ");
    LOG(INFO) << "The toneids of the sentence is: "
              << limonp::Join(toneids.begin(), toneids.end(), " ");
    LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";

    return EXIT_SUCCESS;
}
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import configparser

from paddlespeech.t2s.frontend.zh_frontend import Frontend


def get_phone(frontend,
              word,
              merge_sentences=True,
              print_info=False,
              robot=False,
              get_tone_ids=False):
    phonemes = frontend.get_phonemes(word, merge_sentences, print_info, robot)
    # Some optimizations
    phones, tones = frontend._get_phone_tone(phonemes[0], get_tone_ids)
@@ -22,7 +31,10 @@ def get_phone(frontend, word, merge_sentences=True, print_info=False, robot=Fals
    return phones, tones


def gen_word2phone_dict(frontend,
                        jieba_words_dict,
                        word2phone_dict,
                        get_tone=False):
    with open(jieba_words_dict, "r") as f1, open(word2phone_dict, "w+") as f2:
        for line in f1.readlines():
            word = line.split(" ")[0]
@@ -30,9 +42,9 @@ def gen_word2phone_dict(frontend, jieba_words_dict, word2phone_dict, get_tone=Fa
            phone_str = ""
            if tone:
                assert (len(phone) == len(tone))
                for i in range(len(tone)):
                    phone_tone = phone[i] + tone[i]
                    phone_str += (" " + phone_tone)
                phone_str = phone_str.strip("sp0").strip(" ")
            else:
@@ -45,43 +57,55 @@ def gen_word2phone_dict(frontend, jieba_words_dict, word2phone_dict, get_tone=Fa
def main():
    parser = argparse.ArgumentParser(description="Generate dictionary")
    parser.add_argument(
        "--config", type=str, default="./config.ini", help="config file.")
    parser.add_argument(
        "--am_type",
        type=str,
        default="fastspeech2",
        help="fastspeech2 or speedyspeech")
    args = parser.parse_args()

    # Read config
    cf = configparser.ConfigParser()
    cf.read(args.config)
    jieba_words_dict_file = cf.get("jieba",
                                   "jieba_words_dict")  # get words dict

    am_type = args.am_type
    if (am_type == "fastspeech2"):
        phone2id_dict_file = cf.get(am_type, "phone2id_dict")
        word2phone_dict_file = cf.get(am_type, "word2phone_dict")

        frontend = Frontend(phone_vocab_path=phone2id_dict_file)
        print("frontend done!")

        gen_word2phone_dict(
            frontend,
            jieba_words_dict_file,
            word2phone_dict_file,
            get_tone=False)

    elif (am_type == "speedyspeech"):
        phone2id_dict_file = cf.get(am_type, "phone2id_dict")
        tone2id_dict_file = cf.get(am_type, "tone2id_dict")
        word2phone_dict_file = cf.get(am_type, "word2phone_dict")

        frontend = Frontend(
            phone_vocab_path=phone2id_dict_file,
            tone_vocab_path=tone2id_dict_file)
        print("frontend done!")

        gen_word2phone_dict(
            frontend,
            jieba_words_dict_file,
            word2phone_dict_file,
            get_tone=True)

    else:
        print("Please set correct am type, fastspeech2 or speedyspeech.")


if __name__ == "__main__":
    main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
PHONESFILE = "./dict/phones.txt"
PHONES_ID_FILE = "./dict/phonesid.dict"
TONESFILE = "./dict/tones.txt"
TONES_ID_FILE = "./dict/tonesid.dict"


def GenIdFile(file, idfile):
    id = 2
    with open(file, 'r') as f1, open(idfile, "w+") as f2:
@@ -16,7 +29,7 @@ def GenIdFile(file, idfile):
            f2.write(phone + " " + str(id) + "\n")
            id += 1


if __name__ == "__main__":
    GenIdFile(PHONESFILE, PHONES_ID_FILE)
    GenIdFile(TONESFILE, TONES_ID_FILE)
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

from pypinyin import lazy_pinyin
from pypinyin import Style
worddict = "./dict/jieba_part.dict.utf8"
newdict = "./dict/word_phones.dict"


def GenPhones(initials, finals, seperate=True):
    phones = []
@@ -14,9 +30,9 @@ def GenPhones(initials, finals, seperate=True):
        elif c in ['zh', 'ch', 'sh', 'r']:
            v = re.sub('i', 'iii', v)
        if c:
            if seperate is True:
                phones.append(c + '0')
            elif seperate is False:
                phones.append(c)
            else:
                print("Not sure whether phone and tone need to be separated")
@@ -28,8 +44,10 @@ def GenPhones(initials, finals, seperate=True):
with open(worddict, "r") as f1, open(newdict, "w+") as f2:
    for line in f1.readlines():
        word = line.split(" ")[0]
        initials = lazy_pinyin(
            word, neutral_tone_with_five=True, style=Style.INITIALS)
        finals = lazy_pinyin(
            word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        phones = GenPhones(initials, finals, True)
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "base/type_conv.h" #include "base/type_conv.h"
namespace ppspeech { namespace ppspeech {
// wstring to string // wstring to string
std::string wstring2utf8string(const std::wstring& str) std::string wstring2utf8string(const std::wstring& str) {
{ static std::wstring_convert<std::codecvt_utf8<wchar_t>> strCnv;
static std::wstring_convert<std::codecvt_utf8<wchar_t> > strCnv;
return strCnv.to_bytes(str); return strCnv.to_bytes(str);
} }
// string to wstring
std::wstring utf8string2wstring(const std::string& str)
{
static std::wstring_convert< std::codecvt_utf8<wchar_t> > strCnv;
return strCnv.from_bytes(str);
}
// string to wstring
std::wstring utf8string2wstring(const std::string& str) {
static std::wstring_convert<std::codecvt_utf8<wchar_t>> strCnv;
return strCnv.from_bytes(str);
} }
} // namespace ppspeech
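// A minimal round-trip sketch for the two helpers above (the literal is just
// an example string):
//
//     std::wstring ws = ppspeech::utf8string2wstring("你好");
//     std::string utf8 = ppspeech::wstring2utf8string(ws);  // back to UTF-8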
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BASE_TYPE_CONVC_H
#define BASE_TYPE_CONVC_H

#include <codecvt>
#include <locale>
#include <string>

namespace ppspeech {
// wstring to string
std::string wstring2utf8string(const std::wstring& str);

// string to wstring
std::wstring utf8string2wstring(const std::string& str);
}

#endif  // BASE_TYPE_CONVC_H
\ No newline at end of file
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_TTS_SERVING_FRONT_FRONT_INTERFACE_H
#define PADDLE_TTS_SERVING_FRONT_FRONT_INTERFACE_H

#include <glog/logging.h>
#include <fstream>
#include <map>
#include <memory>
#include <string>
//#include "utils/dir_utils.h"
#include <cppjieba/Jieba.hpp>
#include "absl/strings/str_split.h"
#include "front/text_normalize.h"

namespace ppspeech {
class FrontEngineInterface : public TextNormalizer {
  public:
    explicit FrontEngineInterface(std::string conf) : _conf_file(conf) {
        TextNormalizer();
        _jieba = nullptr;
        _initialed = false;
        init();
    }

    int init();
    ~FrontEngineInterface() {}

    // Read the configuration file
    int ReadConfFile();

    // Convert traditional Chinese to simplified Chinese
    int Trand2Simp(const std::wstring &sentence, std::wstring *sentence_simp);

    // Build a dictionary from file
    int GenDict(const std::string &file,
                std::map<std::string, std::string> *map);

    // Convert word + POS segmentation results into word-only results
    int GetSegResult(std::vector<std::pair<std::string, std::string>> *seg,
                     std::vector<std::string> *seg_words);

    // Generate the phoneme and tone ids of a sentence. If phonemes and tones
    // are not separated, toneids is empty (fastspeech2); otherwise it is not
    // (speedyspeech)
    int GetSentenceIds(const std::string &sentence,
                       std::vector<int> *phoneids,
                       std::vector<int> *toneids);

    // Get the phoneme and tone ids of each word from the segmentation result
    // and adjust the pronunciation where needed (ModifyTone). If phonemes and
    // tones are not separated, toneids is empty (fastspeech2); otherwise it is
    // not (speedyspeech)
    int GetWordsIds(
        const std::vector<std::pair<std::string, std::string>> &cut_result,
        std::vector<int> *phoneids,
        std::vector<int> *toneids);

    // Run Jieba segmentation to get word + POS pairs, then post-process the
    // result (MergeforModify)
    int Cut(const std::string &sentence,
            std::vector<std::pair<std::string, std::string>> *cut_result);

    // Map a word to its phonemes via dictionary lookup
    int GetPhone(const std::string &word, std::string *phone);

    // Map phonemes to phoneme ids
    int Phone2Phoneid(const std::string &phone,
                      std::vector<int> *phoneid,
                      std::vector<int> *toneids);

    // Use the finals to check whether every character of the word is third
    // tone; returns true if so
    bool AllToneThree(const std::vector<std::string> &finals);

    // Check whether the word is a reduplicated word
    bool IsReduplication(const std::string &word);

    // Get the initials and finals of each character in the word
    int GetInitialsFinals(const std::string &word,
                          std::vector<std::string> *word_initials,
                          std::vector<std::string> *word_finals);

    // Get the finals of each character in the word
    int GetFinals(const std::string &word,
                  std::vector<std::string> *word_finals);

    // Convert the word into a vector whose elements are its characters
    int Word2WordVec(const std::string &word,
                     std::vector<std::wstring> *wordvec);

    // Re-segment the word with a full cut, so that each resulting word is in
    // the dictionary
    int SplitWord(const std::string &word,
                  std::vector<std::string> *fullcut_word);

    // Post-process segmentation results containing "不"
    std::vector<std::pair<std::string, std::string>> MergeBu(
        std::vector<std::pair<std::string, std::string>> *seg_result);

    // Post-process segmentation results containing "一"
    std::vector<std::pair<std::string, std::string>> Mergeyi(
        std::vector<std::pair<std::string, std::string>> *seg_result);

    // Merge two adjacent identical characters in the segmentation result
    std::vector<std::pair<std::string, std::string>> MergeReduplication(
        std::vector<std::pair<std::string, std::string>> *seg_result);

    // Merge a word with the following word when both are read entirely in the
    // third tone
    std::vector<std::pair<std::string, std::string>> MergeThreeTones(
        std::vector<std::pair<std::string, std::string>> *seg_result);

    // Merge a word with the following word when the last syllable of the
    // first and the first syllable of the second are both third tone
    std::vector<std::pair<std::string, std::string>> MergeThreeTones2(
        std::vector<std::pair<std::string, std::string>> *seg_result);

    // Post-process segmentation results containing "儿"
    std::vector<std::pair<std::string, std::string>> MergeEr(
        std::vector<std::pair<std::string, std::string>> *seg_result);

    // Post-process and adjust the segmentation result
    int MergeforModify(
        std::vector<std::pair<std::string, std::string>> *seg_result,
        std::vector<std::pair<std::string, std::string>> *merge_seg_result);

    // Tone sandhi for words containing "不"
    int BuSandi(const std::string &word, std::vector<std::string> *finals);

    // Tone sandhi for words containing "一"
    int YiSandhi(const std::string &word, std::vector<std::string> *finals);

    // Neutral-tone sandhi for special words (measure words, particles, etc.)
    int NeuralSandhi(const std::string &word,
                     const std::string &pos,
                     std::vector<std::string> *finals);

    // Third-tone sandhi
    int ThreeSandhi(const std::string &word, std::vector<std::string> *finals);

    // Apply tone modifications to a word
    int ModifyTone(const std::string &word,
                   const std::string &pos,
                   std::vector<std::string> *finals);

    // Handle erhua (rhotacization)
    std::vector<std::vector<std::string>> MergeErhua(
        const std::vector<std::string> &initials,
        const std::vector<std::string> &finals,
        const std::string &word,
        const std::string &pos);

  private:
    bool _initialed;
    cppjieba::Jieba *_jieba;
    std::vector<std::string> _punc;
    std::vector<std::string> _punc_omit;

    std::string _conf_file;
    std::map<std::string, std::string> conf_map;
    std::map<std::string, std::string> word_phone_map;
    std::map<std::string, std::string> phone_id_map;
    std::map<std::string, std::string> tone_id_map;
    std::map<std::string, std::string> trand_simp_map;

    std::string _jieba_dict_path;
    std::string _jieba_hmm_path;
    std::string _jieba_user_dict_path;
    std::string _jieba_idf_path;
    std::string _jieba_stop_word_path;

    std::string _seperate_tone;
    std::string _word2phone_path;
    std::string _phone2id_path;
    std::string _tone2id_path;
    std::string _trand2simp_path;

    std::vector<std::string> must_erhua;
    std::vector<std::string> not_erhua;

    std::vector<std::string> must_not_neural_tone_words;
    std::vector<std::string> must_neural_tone_words;
};
}  // namespace ppspeech
#endif
\ No newline at end of file
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H
#define PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H

#include <glog/logging.h>
#include <codecvt>
#include <map>
#include <regex>
#include <string>
#include "absl/strings/str_split.h"
#include "absl/strings/strip.h"
#include "base/type_conv.h"
@@ -13,50 +26,52 @@
namespace ppspeech {

class TextNormalizer {
  public:
    TextNormalizer() { InitMap(); }
    ~TextNormalizer() {}

    int InitMap();
    int Replace(std::wstring *sentence,
                const int &pos,
                const int &len,
                const std::wstring &repstr);
    int SplitByPunc(const std::wstring &sentence,
                    std::vector<std::wstring> *sentence_part);

    std::string CreateTextValue(const std::string &num, bool use_zero = true);
    std::string SingleDigit2Text(const std::string &num_str,
                                 bool alt_one = false);
    std::string SingleDigit2Text(const std::wstring &num, bool alt_one = false);
    std::string MultiDigit2Text(const std::string &num_str,
                                bool alt_one = false,
                                bool use_zero = true);
    std::string MultiDigit2Text(const std::wstring &num,
                                bool alt_one = false,
                                bool use_zero = true);
    std::string Digits2Text(const std::string &num_str);
    std::string Digits2Text(const std::wstring &num);

    int ReData(std::wstring *sentence);
    int ReData2(std::wstring *sentence);
    int ReTime(std::wstring *sentence);
    int ReTemperature(std::wstring *sentence);
    int ReFrac(std::wstring *sentence);
    int RePercentage(std::wstring *sentence);
    int ReMobilePhone(std::wstring *sentence);
    int RePhone(std::wstring *sentence);
    int ReRange(std::wstring *sentence);
    int ReInterger(std::wstring *sentence);
    int ReDecimalNum(std::wstring *sentence);
    int RePositiveQuantifiers(std::wstring *sentence);
    int ReDefalutNum(std::wstring *sentence);
    int ReNumber(std::wstring *sentence);
    int SentenceNormalize(std::wstring *sentence);

  private:
    std::map<std::string, std::string> digits_map;
    std::map<int, std::string> units_map;
};
}  // namespace ppspeech

#endif
\ No newline at end of file
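// A minimal usage sketch of the pointer-based TextNormalizer API above (the
// sample sentence is an illustrative placeholder):
//
//     ppspeech::TextNormalizer normalizer;
//     std::wstring ws = ppspeech::utf8string2wstring("今天是2023年5月1日");
//     normalizer.SentenceNormalize(&ws);  // normalization applied in place
//     std::vector<std::wstring> parts;
//     normalizer.SplitByPunc(ws, &parts);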