feat: いくつかのAPIを露出し、「テキスト音声合成の流れ」を明確に (#1025)

次のAPIをパブリックAPIとして露出させ、それに関連するテストとドキュメントを用意する。このテストとドキュメントが主目的。テストでは各ショートハンドがショートハンドとして機能するか確かめ、ドキュメントにおいては「テキスト音声合成の流れ」というものを用意してMermaidのflowchart図を載せる。また各ショートハンドメソッドについて、何のショートハンドなのかを明記するようにする。

- `OpenJtalk::analyze`/`TextAnalyzer::analyze` (Rust APIでは既に露出済み)
- `Synthesizer::open_jtalk`/`Synthesizer::text_analyzer`
- `AudioQuery::from_accent_phrases`

`kana`系のAPIについては今回はノータッチ。
VOICEVOX · Mar 1, 2025 · 741f6e3 · 741f6e3
1 parent 5793fbc
commit 741f6e3
Show file tree

Hide file tree

Showing 31 changed files with 1,327 additions and 214 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/voicevox_core/src/__internal/interop.rs b/crates/voicevox_core/src/__internal/interop.rs
@@ -4,7 +4,8 @@ pub use crate::{
     convert::ToJsonValue,
     metas::merge as merge_metas,
     synthesizer::{
-        blocking::PerformInference, DEFAULT_CPU_NUM_THREADS, DEFAULT_ENABLE_INTERROGATIVE_UPSPEAK,
+        blocking::PerformInference, BlockingTextAnalyzerExt, NonblockingTextAnalyzerExt,
+        DEFAULT_CPU_NUM_THREADS, DEFAULT_ENABLE_INTERROGATIVE_UPSPEAK,
         DEFAULT_HEAVY_INFERENCE_CANCELLABLE, MARGIN,
     },
     user_dict::{DEFAULT_PRIORITY, DEFAULT_WORD_TYPE},

diff --git a/crates/voicevox_core/src/lib.rs b/crates/voicevox_core/src/lib.rs
@@ -15,6 +15,78 @@
 //! https://doc.rust-lang.org/cargo/reference/build-scripts.html#rustc-link-lib
 //! [`Onnxruntime`]: blocking::Onnxruntime
 //! [ONNX RuntimeのGPU機能]: https://onnxruntime.ai/docs/execution-providers/
+//!
+//! # 音声の調整
+//!
+//! ユーザーガイドの[テキスト音声合成の流れ]を参照。
+//!
+//! 以下の`wav1`から`wav5`はすべて同一となる。
+//!
+//! [テキスト音声合成の流れ]: https://github.com/VOICEVOX/voicevox_core/blob/main/docs/guide/user/tts-process.md
+//!
+//! ```
+//! use std::collections::HashSet;
+//!
+//! use voicevox_core::{
+//!     blocking::{Synthesizer, TextAnalyzer},
+//!     AudioQuery, StyleId,
+//! };
+//! #
+//! # use test_util::{ONNXRUNTIME_DYLIB_PATH, OPEN_JTALK_DIC_DIR, SAMPLE_VOICE_MODEL_FILE_PATH};
+//! # use voicevox_core::blocking::{Onnxruntime, OpenJtalk, VoiceModelFile};
+//!
+//! fn f(synth: &Synthesizer<impl TextAnalyzer>) -> anyhow::Result<()> {
+//! #    const TEXT: &str = "";
+//! #   #[cfg(any())]
+//!     const TEXT: &str = _;
+//! #
+//! #   const STYLE_ID: StyleId = StyleId(0);
+//! #   #[cfg(any())]
+//!     const STYLE_ID: StyleId = _;
+//!
+//!     let wav1 = synth.tts(TEXT, STYLE_ID).perform()?;
+//!
+//!     let wav2 = {
+//!         let query = synth.create_audio_query(TEXT, STYLE_ID)?;
+//!         synth.synthesis(&query, STYLE_ID).perform()?
+//!     };
+//!
+//!     let wav3 = {
+//!         let phrases = synth.create_accent_phrases(TEXT, STYLE_ID)?;
+//!         let query = AudioQuery::from(phrases);
+//!         synth.synthesis(&query, STYLE_ID).perform()?
+//!     };
+//!
+//!     let wav4 = {
+//!         let phrases = synth.text_analyzer().analyze(TEXT)?;
+//!         let phrases = synth.replace_mora_data(&phrases, STYLE_ID)?;
+//!         let query = AudioQuery::from(phrases);
+//!         synth.synthesis(&query, STYLE_ID).perform()?
+//!     };
+//!
+//!     let wav5 = {
+//!         let phrases = synth.text_analyzer().analyze(TEXT)?;
+//!         let phrases = synth.replace_phoneme_length(&phrases, STYLE_ID)?;
+//!         let phrases = synth.replace_mora_pitch(&phrases, STYLE_ID)?;
+//!         let query = AudioQuery::from(phrases);
+//!         synth.synthesis(&query, STYLE_ID).perform()?
+//!     };
+//!
+//!     assert_eq!(1, HashSet::from([wav1, wav2, wav3, wav4, wav5]).len());
+//!     Ok(())
+//! }
+//! #
+//! # let synth = &{
+//! #     let ort = Onnxruntime::load_once()
+//! #         .filename(ONNXRUNTIME_DYLIB_PATH)
+//! #         .perform()?;
+//! #     let ojt = OpenJtalk::new(OPEN_JTALK_DIC_DIR)?;
+//! #     Synthesizer::builder(ort).text_analyzer(ojt).build()?
+//! # };
+//! # synth.load_voice_model(&VoiceModelFile::open(SAMPLE_VOICE_MODEL_FILE_PATH)?)?;
+//! # f(synth)?;
+//! # anyhow::Ok(())
+//! ```
 
 #![cfg_attr(docsrs, feature(doc_cfg))]
 

diff --git a/crates/voicevox_core/src/metas.rs b/crates/voicevox_core/src/metas.rs
@@ -82,7 +82,7 @@ impl Display for CharacterVersion {
 pub type VoiceModelMeta = Vec<CharacterMeta>;
 
 /// <i>キャラクター</i>のメタ情報。
-#[derive(Deserialize, Serialize, Clone)]
+#[derive(Deserialize, Serialize, Clone, Debug)]
 #[non_exhaustive]
 pub struct CharacterMeta {
     /// キャラクター名。
@@ -142,7 +142,7 @@ impl CharacterMeta {
 }
 
 /// <i>スタイル</i>のメタ情報。
-#[derive(Deserialize, Serialize, Clone)]
+#[derive(Deserialize, Serialize, Clone, Debug)]
 #[non_exhaustive]
 pub struct StyleMeta {
     /// スタイルID。

diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs
@@ -1,6 +1,7 @@
 use easy_ext::ext;
 use enum_map::enum_map;
-use std::{marker::PhantomData, ops::Range, sync::Arc};
+use futures_util::TryFutureExt as _;
+use std::{future::Future, marker::PhantomData, ops::Range, sync::Arc};
 use tracing::info;
 
 use crate::{
@@ -26,7 +27,6 @@ use crate::{
         },
         InferenceRuntime, InferenceSessionOptions,
     },
-    nonblocking::TextAnalyzer as _,
     status::Status,
     voice_model, AccentPhrase, AudioQuery, Result, StyleId, VoiceModelId, VoiceModelMeta,
 };
@@ -664,14 +664,7 @@ trait AsInner {
     where
         Self::TextAnalyzer: crate::nonblocking::TextAnalyzer,
     {
-        let accent_phrases =
-            self.text_analyzer()
-                .analyze(text)
-                .await
-                .map_err(|source| ErrorRepr::AnalyzeText {
-                    text: text.to_owned(),
-                    source,
-                })?;
+        let accent_phrases = self.text_analyzer().analyze_(text).await?;
         self.replace_mora_data(&accent_phrases, style_id).await
     }
 
@@ -680,7 +673,7 @@ trait AsInner {
         Self::TextAnalyzer: crate::nonblocking::TextAnalyzer,
     {
         let accent_phrases = self.create_accent_phrases(text, style_id).await?;
-        Ok(AudioQuery::from_accent_phrases(accent_phrases))
+        Ok(accent_phrases.into())
     }
 
     async fn tts(
@@ -1200,10 +1193,51 @@ fn list_windows_video_cards() {
 }
 
 impl AudioQuery {
-    fn from_accent_phrases(accent_phrases: Vec<AccentPhrase>) -> Self {
+    /// アクセント句の配列からAudioQueryを作る。
+    #[doc(alias = "voicevox_audio_query_create_from_accent_phrases")]
+    pub fn from_accent_phrases(accent_phrases: Vec<AccentPhrase>) -> Self {
         let kana = create_kana(&accent_phrases);
         Self {
             accent_phrases,
+            kana: Some(kana),
+            ..Default::default()
+        }
+    }
+}
+
+#[ext(BlockingTextAnalyzerExt)]
+impl<T: crate::blocking::TextAnalyzer> T {
+    pub fn analyze_(&self, text: &str) -> crate::Result<Vec<AccentPhrase>> {
+        self.analyze(text).map_err(|source| {
+            ErrorRepr::AnalyzeText {
+                text: text.to_owned(),
+                source,
+            }
+            .into()
+        })
+    }
+}
+
+#[ext(NonblockingTextAnalyzerExt)]
+impl<T: crate::nonblocking::TextAnalyzer> T {
+    pub fn analyze_(
+        &self,
+        text: &str,
+    ) -> impl Future<Output = crate::Result<Vec<AccentPhrase>>> + Send {
+        self.analyze(text).map_err(|source| {
+            ErrorRepr::AnalyzeText {
+                text: text.to_owned(),
+                source,
+            }
+            .into()
+        })
+    }
+}
+
+impl Default for AudioQuery {
+    fn default() -> Self {
+        Self {
+            accent_phrases: vec![],
             speed_scale: 1.,
             pitch_scale: 0.,
             intonation_scale: 1.,
@@ -1212,11 +1246,17 @@ impl AudioQuery {
             post_phoneme_length: 0.1,
             output_sampling_rate: DEFAULT_SAMPLING_RATE,
             output_stereo: false,
-            kana: Some(kana),
+            kana: None,
         }
     }
 }
 
+impl From<Vec<AccentPhrase>> for AudioQuery {
+    fn from(accent_phrases: Vec<AccentPhrase>) -> Self {
+        Self::from_accent_phrases(accent_phrases)
+    }
+}
+
 #[expect(
     clippy::too_many_arguments,
     reason = "`PerformInference::predict_intonation`用。compatible_engineでの`predict_intonation`の\
@@ -1290,6 +1330,11 @@ pub(crate) mod blocking {
             self.0.onnxruntime()
         }
 
+        /// テキスト解析器。
+        pub fn text_analyzer(&self) -> &T {
+            &self.0.text_analyzer().0
+        }
+
         /// ハードウェアアクセラレーションがGPUモードか判定する。
         #[doc(alias = "voicevox_synthesizer_is_gpu_mode")]
         pub fn is_gpu_mode(&self) -> bool {
@@ -1410,6 +1455,12 @@ pub(crate) mod blocking {
         }
 
         /// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。
+        ///
+        /// [`replace_phoneme_length`]と[`replace_mora_pitch`]が一体になったショートハンド。詳細は[音声の調整]の節。
+        ///
+        /// [`replace_phoneme_length`]: Self::replace_phoneme_length
+        /// [`replace_mora_pitch`]: Self::replace_mora_pitch
+        /// [音声の調整]: ../index.html#音声の調整
         #[doc(alias = "voicevox_synthesizer_replace_mora_data")]
         pub fn replace_mora_data(
             &self,
@@ -1498,6 +1549,8 @@ pub(crate) mod blocking {
     impl<T: crate::blocking::TextAnalyzer> self::Synthesizer<T> {
         /// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。
         ///
+        /// [`TextAnalyzer::analyze`]と[`replace_mora_data`]が一体になったショートハンド。詳細は[音声の調整]の節。
+        ///
         /// # Example
         ///
         /// ```
@@ -1521,6 +1574,10 @@ pub(crate) mod blocking {
         /// # Ok(())
         /// # }
         /// ```
+        ///
+        /// [`TextAnalyzer::analyze`]: crate::blocking::TextAnalyzer::analyze
+        /// [`replace_mora_data`]: Self::replace_mora_data
+        /// [音声の調整]: ../index.html#音声の調整
         #[doc(alias = "voicevox_synthesizer_create_accent_phrases")]
         pub fn create_accent_phrases(
             &self,
@@ -1532,6 +1589,8 @@ pub(crate) mod blocking {
 
         /// 日本語のテキストから[AudioQuery]を生成する。
         ///
+        /// [`create_accent_phrases`]と[`AudioQuery::from_accent_phrases`]が一体になったショートハンド。詳細は[音声の調整]の節。
+        ///
         /// # Examples
         ///
         /// ```
@@ -1557,6 +1616,8 @@ pub(crate) mod blocking {
         /// ```
         ///
         /// [AudioQuery]: crate::AudioQuery
+        /// [`create_accent_phrases`]: Self::create_accent_phrases
+        /// [音声の調整]: ../index.html#音声の調整
         #[doc(alias = "voicevox_synthesizer_create_audio_query")]
         pub fn create_audio_query(
             &self,
@@ -1567,6 +1628,12 @@ pub(crate) mod blocking {
         }
 
         /// 日本語のテキストから音声合成を行う。
+        ///
+        /// [`create_audio_query`]と[`synthesis`]が一体になったショートハンド。詳細は[音声の調整]の節。
+        ///
+        /// [`create_audio_query`]: Self::create_audio_query
+        /// [`synthesis`]: Self::synthesis
+        /// [音声の調整]: ../index.html#音声の調整
         #[doc(alias = "voicevox_synthesizer_tts")]
         pub fn tts<'a>(&'a self, text: &'a str, style_id: StyleId) -> Tts<'a, T> {
             Tts {
@@ -1712,6 +1779,7 @@ pub(crate) mod blocking {
     }
 
     impl<T> Builder<T> {
+        /// テキスト解析器。
         pub fn text_analyzer<T2>(self, text_analyzer: T2) -> Builder<T2> {
             Builder {
                 text_analyzer,
@@ -1900,6 +1968,11 @@ pub(crate) mod nonblocking {
             crate::nonblocking::Onnxruntime::from_blocking(self.0.onnxruntime())
         }
 
+        /// テキスト解析器。
+        pub fn text_analyzer(&self) -> &T {
+            self.0.text_analyzer()
+        }
+
         /// ハードウェアアクセラレーションがGPUモードか判定する。
         pub fn is_gpu_mode(&self) -> bool {
             self.0.is_gpu_mode()
@@ -1986,6 +2059,12 @@ pub(crate) mod nonblocking {
         }
 
         /// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。
+        ///
+        /// [`replace_phoneme_length`]と[`replace_mora_pitch`]が一体になったショートハンド。詳細は[音声の調整]の節。
+        ///
+        /// [`replace_phoneme_length`]: Self::replace_phoneme_length
+        /// [`replace_mora_pitch`]: Self::replace_mora_pitch
+        /// [音声の調整]: ../index.html#音声の調整
         pub async fn replace_mora_data(
             &self,
             accent_phrases: &[AccentPhrase],
@@ -2068,6 +2147,8 @@ pub(crate) mod nonblocking {
     impl<T: crate::nonblocking::TextAnalyzer> self::Synthesizer<T> {
         /// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。
         ///
+        /// [`TextAnalyzer::analyze`]と[`replace_mora_data`]が一体になったショートハンド。詳細は[音声の調整]の節。
+        ///
         /// # Example
         ///
         /// ```
@@ -2090,6 +2171,10 @@ pub(crate) mod nonblocking {
         /// # Ok(())
         /// # }
         /// ```
+        ///
+        /// [`TextAnalyzer::analyze`]: crate::nonblocking::TextAnalyzer::analyze
+        /// [`replace_mora_data`]: Self::replace_mora_data
+        /// [音声の調整]: ../index.html#音声の調整
         pub async fn create_accent_phrases(
             &self,
             text: &str,
@@ -2100,6 +2185,8 @@ pub(crate) mod nonblocking {
 
         /// 日本語のテキストから[AudioQuery]を生成する。
         ///
+        /// [`create_accent_phrases`]と[`AudioQuery::from_accent_phrases`]が一体になったショートハンド。詳細は[音声の調整]の節。
+        ///
         /// # Examples
         ///
         /// ```
@@ -2124,6 +2211,8 @@ pub(crate) mod nonblocking {
         /// ```
         ///
         /// [AudioQuery]: crate::AudioQuery
+        /// [`create_accent_phrases`]: Self::create_accent_phrases
+        /// [音声の調整]: ../index.html#音声の調整
         pub async fn create_audio_query(
             &self,
             text: &str,
@@ -2134,10 +2223,15 @@ pub(crate) mod nonblocking {
 
         /// 日本語のテキストから音声合成を行う。
         ///
+        /// [`create_audio_query`]と[`synthesis`]が一体になったショートハンド。詳細は[音声の調整]の節。
+        ///
         /// # Caveats
         ///
         /// [`cancellable`]を有効化しない限り、非同期タスクとしてキャンセルしても終わるまで停止しない。
         ///
+        /// [`create_audio_query`]: Self::create_audio_query
+        /// [`synthesis`]: Self::synthesis
+        /// [音声の調整]: ../index.html#音声の調整
         /// [`cancellable`]: Tts::cancellable
         pub fn tts<'a>(&'a self, text: &'a str, style_id: StyleId) -> Tts<'a, T> {
             Tts {
@@ -2165,6 +2259,7 @@ pub(crate) mod nonblocking {
     }
 
     impl<T> Builder<T> {
+        /// テキスト解析器。
         pub fn text_analyzer<T2>(self, text_analyzer: T2) -> Builder<T2> {
             Builder {
                 text_analyzer,