Skip to content

Commit

Permalink
feat: いくつかのAPIを露出し、「テキスト音声合成の流れ」を明確に (#1025)
Browse files Browse the repository at this point in the history
次のAPIをパブリックAPIとして露出させ、それに関連するテストとドキュメント
を用意する。このテストとドキュメントが主目的。テストでは各ショートハンド
がショートハンドとして機能するか確かめ、ドキュメントにおいては「テキスト
音声合成の流れ」というものを用意してMermaidのflowchart図を載せる。また各
ショートハンドメソッドについて、何のショートハンドなのかを明記するように
する。

- `OpenJtalk::analyze`/`TextAnalyzer::analyze`
    (Rust APIでは既に露出済み)
- `Synthesizer::open_jtalk`/`Synthesizer::text_analyzer`
- `AudioQuery::from_accent_phrases`

`kana`系のAPIについては今回はノータッチ。
  • Loading branch information
qryxip authored Mar 1, 2025
1 parent 5793fbc commit 741f6e3
Show file tree
Hide file tree
Showing 31 changed files with 1,327 additions and 214 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion crates/voicevox_core/src/__internal/interop.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ pub use crate::{
convert::ToJsonValue,
metas::merge as merge_metas,
synthesizer::{
blocking::PerformInference, DEFAULT_CPU_NUM_THREADS, DEFAULT_ENABLE_INTERROGATIVE_UPSPEAK,
blocking::PerformInference, BlockingTextAnalyzerExt, NonblockingTextAnalyzerExt,
DEFAULT_CPU_NUM_THREADS, DEFAULT_ENABLE_INTERROGATIVE_UPSPEAK,
DEFAULT_HEAVY_INFERENCE_CANCELLABLE, MARGIN,
},
user_dict::{DEFAULT_PRIORITY, DEFAULT_WORD_TYPE},
Expand Down
72 changes: 72 additions & 0 deletions crates/voicevox_core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,78 @@
//! https://doc.rust-lang.org/cargo/reference/build-scripts.html#rustc-link-lib
//! [`Onnxruntime`]: blocking::Onnxruntime
//! [ONNX RuntimeのGPU機能]: https://onnxruntime.ai/docs/execution-providers/
//!
//! # 音声の調整
//!
//! ユーザーガイドの[テキスト音声合成の流れ]を参照。
//!
//! 以下の`wav1`から`wav5`はすべて同一となる。
//!
//! [テキスト音声合成の流れ]: https://github.com/VOICEVOX/voicevox_core/blob/main/docs/guide/user/tts-process.md
//!
//! ```
//! use std::collections::HashSet;
//!
//! use voicevox_core::{
//! blocking::{Synthesizer, TextAnalyzer},
//! AudioQuery, StyleId,
//! };
//! #
//! # use test_util::{ONNXRUNTIME_DYLIB_PATH, OPEN_JTALK_DIC_DIR, SAMPLE_VOICE_MODEL_FILE_PATH};
//! # use voicevox_core::blocking::{Onnxruntime, OpenJtalk, VoiceModelFile};
//!
//! fn f(synth: &Synthesizer<impl TextAnalyzer>) -> anyhow::Result<()> {
//! # const TEXT: &str = "";
//! # #[cfg(any())]
//! const TEXT: &str = _;
//! #
//! # const STYLE_ID: StyleId = StyleId(0);
//! # #[cfg(any())]
//! const STYLE_ID: StyleId = _;
//!
//! let wav1 = synth.tts(TEXT, STYLE_ID).perform()?;
//!
//! let wav2 = {
//! let query = synth.create_audio_query(TEXT, STYLE_ID)?;
//! synth.synthesis(&query, STYLE_ID).perform()?
//! };
//!
//! let wav3 = {
//! let phrases = synth.create_accent_phrases(TEXT, STYLE_ID)?;
//! let query = AudioQuery::from(phrases);
//! synth.synthesis(&query, STYLE_ID).perform()?
//! };
//!
//! let wav4 = {
//! let phrases = synth.text_analyzer().analyze(TEXT)?;
//! let phrases = synth.replace_mora_data(&phrases, STYLE_ID)?;
//! let query = AudioQuery::from(phrases);
//! synth.synthesis(&query, STYLE_ID).perform()?
//! };
//!
//! let wav5 = {
//! let phrases = synth.text_analyzer().analyze(TEXT)?;
//! let phrases = synth.replace_phoneme_length(&phrases, STYLE_ID)?;
//! let phrases = synth.replace_mora_pitch(&phrases, STYLE_ID)?;
//! let query = AudioQuery::from(phrases);
//! synth.synthesis(&query, STYLE_ID).perform()?
//! };
//!
//! assert_eq!(1, HashSet::from([wav1, wav2, wav3, wav4, wav5]).len());
//! Ok(())
//! }
//! #
//! # let synth = &{
//! # let ort = Onnxruntime::load_once()
//! # .filename(ONNXRUNTIME_DYLIB_PATH)
//! # .perform()?;
//! # let ojt = OpenJtalk::new(OPEN_JTALK_DIC_DIR)?;
//! # Synthesizer::builder(ort).text_analyzer(ojt).build()?
//! # };
//! # synth.load_voice_model(&VoiceModelFile::open(SAMPLE_VOICE_MODEL_FILE_PATH)?)?;
//! # f(synth)?;
//! # anyhow::Ok(())
//! ```
#![cfg_attr(docsrs, feature(doc_cfg))]

Expand Down
4 changes: 2 additions & 2 deletions crates/voicevox_core/src/metas.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ impl Display for CharacterVersion {
pub type VoiceModelMeta = Vec<CharacterMeta>;

/// <i>キャラクター</i>のメタ情報。
#[derive(Deserialize, Serialize, Clone)]
#[derive(Deserialize, Serialize, Clone, Debug)]
#[non_exhaustive]
pub struct CharacterMeta {
/// キャラクター名。
Expand Down Expand Up @@ -142,7 +142,7 @@ impl CharacterMeta {
}

/// <i>スタイル</i>のメタ情報。
#[derive(Deserialize, Serialize, Clone)]
#[derive(Deserialize, Serialize, Clone, Debug)]
#[non_exhaustive]
pub struct StyleMeta {
/// スタイルID。
Expand Down
121 changes: 108 additions & 13 deletions crates/voicevox_core/src/synthesizer.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use easy_ext::ext;
use enum_map::enum_map;
use std::{marker::PhantomData, ops::Range, sync::Arc};
use futures_util::TryFutureExt as _;
use std::{future::Future, marker::PhantomData, ops::Range, sync::Arc};
use tracing::info;

use crate::{
Expand All @@ -26,7 +27,6 @@ use crate::{
},
InferenceRuntime, InferenceSessionOptions,
},
nonblocking::TextAnalyzer as _,
status::Status,
voice_model, AccentPhrase, AudioQuery, Result, StyleId, VoiceModelId, VoiceModelMeta,
};
Expand Down Expand Up @@ -664,14 +664,7 @@ trait AsInner {
where
Self::TextAnalyzer: crate::nonblocking::TextAnalyzer,
{
let accent_phrases =
self.text_analyzer()
.analyze(text)
.await
.map_err(|source| ErrorRepr::AnalyzeText {
text: text.to_owned(),
source,
})?;
let accent_phrases = self.text_analyzer().analyze_(text).await?;
self.replace_mora_data(&accent_phrases, style_id).await
}

Expand All @@ -680,7 +673,7 @@ trait AsInner {
Self::TextAnalyzer: crate::nonblocking::TextAnalyzer,
{
let accent_phrases = self.create_accent_phrases(text, style_id).await?;
Ok(AudioQuery::from_accent_phrases(accent_phrases))
Ok(accent_phrases.into())
}

async fn tts(
Expand Down Expand Up @@ -1200,10 +1193,51 @@ fn list_windows_video_cards() {
}

impl AudioQuery {
fn from_accent_phrases(accent_phrases: Vec<AccentPhrase>) -> Self {
/// アクセント句の配列からAudioQueryを作る。
#[doc(alias = "voicevox_audio_query_create_from_accent_phrases")]
pub fn from_accent_phrases(accent_phrases: Vec<AccentPhrase>) -> Self {
let kana = create_kana(&accent_phrases);
Self {
accent_phrases,
kana: Some(kana),
..Default::default()
}
}
}

// Extension for blocking text analyzers: exposes `analyze_`, which reports
// analysis failures through the crate's own error type.
#[ext(BlockingTextAnalyzerExt)]
impl<T: crate::blocking::TextAnalyzer> T {
    /// Analyzes `text` into accent phrases.
    ///
    /// On failure, the underlying error is wrapped in `ErrorRepr::AnalyzeText`
    /// together with an owned copy of `text` and converted into the crate error.
    pub fn analyze_(&self, text: &str) -> crate::Result<Vec<AccentPhrase>> {
        match self.analyze(text) {
            Ok(accent_phrases) => Ok(accent_phrases),
            Err(source) => {
                // The input text is only copied on the failure path.
                let text = text.to_owned();
                Err(ErrorRepr::AnalyzeText { text, source }.into())
            }
        }
    }
}

// Extension for non-blocking text analyzers: exposes `analyze_`, which reports
// analysis failures through the crate's own error type.
#[ext(NonblockingTextAnalyzerExt)]
impl<T: crate::nonblocking::TextAnalyzer> T {
    /// Analyzes `text` into accent phrases, returning a `Send` future.
    ///
    /// If the analysis future resolves to an error, that error is wrapped in
    /// `ErrorRepr::AnalyzeText` together with an owned copy of `text` and
    /// converted into the crate error.
    pub fn analyze_(
        &self,
        text: &str,
    ) -> impl Future<Output = crate::Result<Vec<AccentPhrase>>> + Send {
        // `analyze` is invoked right here; only the error mapping is deferred
        // until the returned future resolves.
        let pending = self.analyze(text);
        pending.map_err(move |source| {
            let repr = ErrorRepr::AnalyzeText {
                text: text.to_owned(),
                source,
            };
            repr.into()
        })
    }
}

impl Default for AudioQuery {
fn default() -> Self {
Self {
accent_phrases: vec![],
speed_scale: 1.,
pitch_scale: 0.,
intonation_scale: 1.,
Expand All @@ -1212,11 +1246,17 @@ impl AudioQuery {
post_phoneme_length: 0.1,
output_sampling_rate: DEFAULT_SAMPLING_RATE,
output_stereo: false,
kana: Some(kana),
kana: None,
}
}
}

impl From<Vec<AccentPhrase>> for AudioQuery {
fn from(accent_phrases: Vec<AccentPhrase>) -> Self {
Self::from_accent_phrases(accent_phrases)
}
}

#[expect(
clippy::too_many_arguments,
reason = "`PerformInference::predict_intonation`用。compatible_engineでの`predict_intonation`の\
Expand Down Expand Up @@ -1290,6 +1330,11 @@ pub(crate) mod blocking {
self.0.onnxruntime()
}

/// The text analyzer held by this synthesizer.
pub fn text_analyzer(&self) -> &T {
    // NOTE(review): the inner synthesizer appears to keep the analyzer inside a
    // single-field wrapper; `.0` borrows the user-supplied `T` from it — confirm.
    let wrapped = self.0.text_analyzer();
    &wrapped.0
}

/// ハードウェアアクセラレーションがGPUモードか判定する。
#[doc(alias = "voicevox_synthesizer_is_gpu_mode")]
pub fn is_gpu_mode(&self) -> bool {
Expand Down Expand Up @@ -1410,6 +1455,12 @@ pub(crate) mod blocking {
}

/// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。
///
/// [`replace_phoneme_length`]と[`replace_mora_pitch`]が一体になったショートハンド。詳細は[音声の調整]の節。
///
/// [`replace_phoneme_length`]: Self::replace_phoneme_length
/// [`replace_mora_pitch`]: Self::replace_mora_pitch
/// [音声の調整]: ../index.html#音声の調整
#[doc(alias = "voicevox_synthesizer_replace_mora_data")]
pub fn replace_mora_data(
&self,
Expand Down Expand Up @@ -1498,6 +1549,8 @@ pub(crate) mod blocking {
impl<T: crate::blocking::TextAnalyzer> self::Synthesizer<T> {
/// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。
///
/// [`TextAnalyzer::analyze`]と[`replace_mora_data`]が一体になったショートハンド。詳細は[音声の調整]の節。
///
/// # Example
///
/// ```
Expand All @@ -1521,6 +1574,10 @@ pub(crate) mod blocking {
/// # Ok(())
/// # }
/// ```
///
/// [`TextAnalyzer::analyze`]: crate::blocking::TextAnalyzer::analyze
/// [`replace_mora_data`]: Self::replace_mora_data
/// [音声の調整]: ../index.html#音声の調整
#[doc(alias = "voicevox_synthesizer_create_accent_phrases")]
pub fn create_accent_phrases(
&self,
Expand All @@ -1532,6 +1589,8 @@ pub(crate) mod blocking {

/// 日本語のテキストから[AudioQuery]を生成する。
///
/// [`create_accent_phrases`]と[`AudioQuery::from_accent_phrases`]が一体になったショートハンド。詳細は[音声の調整]の節。
///
/// # Examples
///
/// ```
Expand All @@ -1557,6 +1616,8 @@ pub(crate) mod blocking {
/// ```
///
/// [AudioQuery]: crate::AudioQuery
/// [`create_accent_phrases`]: Self::create_accent_phrases
/// [音声の調整]: ../index.html#音声の調整
#[doc(alias = "voicevox_synthesizer_create_audio_query")]
pub fn create_audio_query(
&self,
Expand All @@ -1567,6 +1628,12 @@ pub(crate) mod blocking {
}

/// 日本語のテキストから音声合成を行う。
///
/// [`create_audio_query`]と[`synthesis`]が一体になったショートハンド。詳細は[音声の調整]の節。
///
/// [`create_audio_query`]: Self::create_audio_query
/// [`synthesis`]: Self::synthesis
/// [音声の調整]: ../index.html#音声の調整
#[doc(alias = "voicevox_synthesizer_tts")]
pub fn tts<'a>(&'a self, text: &'a str, style_id: StyleId) -> Tts<'a, T> {
Tts {
Expand Down Expand Up @@ -1712,6 +1779,7 @@ pub(crate) mod blocking {
}

impl<T> Builder<T> {
/// テキスト解析器。
pub fn text_analyzer<T2>(self, text_analyzer: T2) -> Builder<T2> {
Builder {
text_analyzer,
Expand Down Expand Up @@ -1900,6 +1968,11 @@ pub(crate) mod nonblocking {
crate::nonblocking::Onnxruntime::from_blocking(self.0.onnxruntime())
}

/// The text analyzer held by this synthesizer.
///
/// Returns a shared reference obtained from the inner synthesizer's
/// `text_analyzer()` accessor.
pub fn text_analyzer(&self) -> &T {
self.0.text_analyzer()
}

/// ハードウェアアクセラレーションがGPUモードか判定する。
pub fn is_gpu_mode(&self) -> bool {
self.0.is_gpu_mode()
Expand Down Expand Up @@ -1986,6 +2059,12 @@ pub(crate) mod nonblocking {
}

/// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。
///
/// [`replace_phoneme_length`]と[`replace_mora_pitch`]が一体になったショートハンド。詳細は[音声の調整]の節。
///
/// [`replace_phoneme_length`]: Self::replace_phoneme_length
/// [`replace_mora_pitch`]: Self::replace_mora_pitch
/// [音声の調整]: ../index.html#音声の調整
pub async fn replace_mora_data(
&self,
accent_phrases: &[AccentPhrase],
Expand Down Expand Up @@ -2068,6 +2147,8 @@ pub(crate) mod nonblocking {
impl<T: crate::nonblocking::TextAnalyzer> self::Synthesizer<T> {
/// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。
///
/// [`TextAnalyzer::analyze`]と[`replace_mora_data`]が一体になったショートハンド。詳細は[音声の調整]の節。
///
/// # Example
///
/// ```
Expand All @@ -2090,6 +2171,10 @@ pub(crate) mod nonblocking {
/// # Ok(())
/// # }
/// ```
///
/// [`TextAnalyzer::analyze`]: crate::nonblocking::TextAnalyzer::analyze
/// [`replace_mora_data`]: Self::replace_mora_data
/// [音声の調整]: ../index.html#音声の調整
pub async fn create_accent_phrases(
&self,
text: &str,
Expand All @@ -2100,6 +2185,8 @@ pub(crate) mod nonblocking {

/// 日本語のテキストから[AudioQuery]を生成する。
///
/// [`create_accent_phrases`]と[`AudioQuery::from_accent_phrases`]が一体になったショートハンド。詳細は[音声の調整]の節。
///
/// # Examples
///
/// ```
Expand All @@ -2124,6 +2211,8 @@ pub(crate) mod nonblocking {
/// ```
///
/// [AudioQuery]: crate::AudioQuery
/// [`create_accent_phrases`]: Self::create_accent_phrases
/// [音声の調整]: ../index.html#音声の調整
pub async fn create_audio_query(
&self,
text: &str,
Expand All @@ -2134,10 +2223,15 @@ pub(crate) mod nonblocking {

/// 日本語のテキストから音声合成を行う。
///
/// [`create_audio_query`]と[`synthesis`]が一体になったショートハンド。詳細は[音声の調整]の節。
///
/// # Caveats
///
/// [`cancellable`]を有効化しない限り、非同期タスクとしてキャンセルしても終わるまで停止しない。
///
/// [`create_audio_query`]: Self::create_audio_query
/// [`synthesis`]: Self::synthesis
/// [音声の調整]: ../index.html#音声の調整
/// [`cancellable`]: Tts::cancellable
pub fn tts<'a>(&'a self, text: &'a str, style_id: StyleId) -> Tts<'a, T> {
Tts {
Expand Down Expand Up @@ -2165,6 +2259,7 @@ pub(crate) mod nonblocking {
}

impl<T> Builder<T> {
/// テキスト解析器。
pub fn text_analyzer<T2>(self, text_analyzer: T2) -> Builder<T2> {
Builder {
text_analyzer,
Expand Down
Loading

0 comments on commit 741f6e3

Please sign in to comment.