From c8deaad4e49a132393a6ba77463aeb20827ff93e Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sat, 16 Nov 2024 11:12:29 +0900 Subject: [PATCH] =?UTF-8?q?feat:=20`pause=5Flength{,=5Fscale}`=E3=82=92?= =?UTF-8?q?=E3=83=87=E3=83=95=E3=82=A9=E3=83=AB=E3=83=88=E5=80=A4=E9=99=90?= =?UTF-8?q?=E5=AE=9A=E3=81=A7=E5=8F=97=E3=81=91=E5=85=A5=E3=82=8C=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VOICEVOX/voicevox_engine#1308 と VOICEVOX/voicevox_engine#1425 の一部 を参考にコードを書いた。 @Hiroshiba さんと以下の2名の許諾のもと、 #874 にのっとりMITライセンスと してライセンスする。 * @X-20A (VOICEVOX/voicevox_engine#1308) * @sabonerune (VOICEVOX/voicevox_engine#1425) Co-Authored-By: X-20A <155217226+X-20A@users.noreply.github.com> Co-Authored-By: sabonerune <102559104+sabonerune@users.noreply.github.com> Co-authored-by: Hiroshiba Refs: https://github.com/VOICEVOX/voicevox_core/issues/874#issuecomment-2489461101 Refs: https://github.com/VOICEVOX/voicevox_core/issues/874#issuecomment-2482637392 --- crates/voicevox_core/src/engine/model.rs | 140 +++++++++++++++++- crates/voicevox_core/src/synthesizer.rs | 2 + .../jp/hiroshiba/voicevoxcore/AudioQuery.java | 13 ++ .../python/voicevox_core/_models.py | 6 + 4 files changed, 160 insertions(+), 1 deletion(-) diff --git a/crates/voicevox_core/src/engine/model.rs b/crates/voicevox_core/src/engine/model.rs index 20203feb3..fad10f493 100644 --- a/crates/voicevox_core/src/engine/model.rs +++ b/crates/voicevox_core/src/engine/model.rs @@ -1,4 +1,7 @@ -use serde::{Deserialize, Serialize}; +use std::fmt; + +use duplicate::duplicate_item; +use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; /* 各フィールドのjsonフィールド名はsnake_caseとする*/ @@ -64,6 +67,20 @@ pub struct AudioQuery { pub output_sampling_rate: u32, /// 音声データをステレオ出力するか否か。 pub output_stereo: bool, + /// 句読点などの無音時間。`null`のときは無視される。デフォルト値は`null`。 + #[serde( + default, + deserialize_with = "deserialize_pause_length", + serialize_with = "serialize_pause_length" + )] + pub pause_length: (), + /// 読点などの無音時間(倍率)。デフォルト値は`1`。 + #[serde( + default, + deserialize_with = "deserialize_pause_length_scale", + serialize_with = "serialize_pause_length_scale" + )] + pub pause_length_scale: (), /// \[読み取り専用\] AquesTalk風記法。 /// /// [`Synthesizer::audio_query`]が返すもののみ`Some`となる。入力としてのAudioQueryでは無視され @@ -73,6 +90,87 @@ pub struct AudioQuery { pub kana: Option, } +fn deserialize_pause_length<'de, D>(deserializer: D) -> Result<(), D::Error> +where + D: Deserializer<'de>, +{ + return deserializer.deserialize_any(Visitor); + + struct Visitor; + + impl<'de> de::Visitor<'de> for Visitor { + type Value = (); + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("`null`") + } + + #[duplicate_item( + method T; + [ visit_i64 ] [ i64 ]; + [ visit_u64 ] [ u64 ]; + [ visit_f64 ] [ f64 ]; + )] + fn method(self, _: T) -> Result + where + E: de::Error, + { + Err(E::custom("currently `pause_length` must be `null`")) + } + + fn visit_unit(self) -> Result { + Ok(()) + } + } +} + +fn serialize_pause_length(_: &(), serializer: S) -> Result +where + S: Serializer, +{ + serializer.serialize_unit() +} + +fn deserialize_pause_length_scale<'de, D>(deserializer: D) -> Result<(), D::Error> +where + D: Deserializer<'de>, +{ + return deserializer.deserialize_any(Visitor); + + struct Visitor; + + impl<'de> de::Visitor<'de> for Visitor { + type Value = (); + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("`1.`") + } + + #[duplicate_item( + method T ONE; + [ visit_i64 ] [ i64 ] [ 1 ]; + [ visit_u64 ] [ u64 ] [ 1 ]; + [ visit_f64 ] [ f64 ] [ 1. ]; + )] + fn method(self, v: T) -> Result + where + E: de::Error, + { + if v != ONE { + return Err(E::custom("currently `pause_length_scale` must be `1.`")); + } + Ok(()) + } + } +} + +fn serialize_pause_length_scale(_: &(), serializer: S) -> Result +where + S: Serializer, +{ + (1.).serialize(serializer) +} + impl AudioQuery { pub(crate) fn with_kana(self, kana: Option) -> Self { Self { kana, ..self } @@ -99,6 +197,8 @@ mod tests { post_phoneme_length: 0.0, output_sampling_rate: 0, output_stereo: false, + pause_length: (), + pause_length_scale: (), kana: None, }; let val = serde_json::to_value(audio_query_model).unwrap(); @@ -152,4 +252,42 @@ mod tests { }))?; Ok(()) } + + // TODO: 型的に自明になったらこのテストは削除する + #[rstest] + fn it_denies_non_null_for_pause_length() { + serde_json::from_value::(json!({ + "accent_phrases": [], + "speed_scale": 1.0, + "pitch_scale": 0.0, + "intonation_scale": 1.0, + "volume_scale": 1.0, + "pre_phoneme_length": 0.1, + "post_phoneme_length": 0.1, + "output_sampling_rate": 24000, + "output_stereo": false, + "pause_length": "aaaaa" + })) + .map(|_| ()) + .unwrap_err(); + } + + // TODO: 型的に自明になったらこのテストは削除する + #[rstest] + fn it_denies_non_float_for_pause_length_scale() { + serde_json::from_value::(json!({ + "accent_phrases": [], + "speed_scale": 1.0, + "pitch_scale": 0.0, + "intonation_scale": 1.0, + "volume_scale": 1.0, + "pre_phoneme_length": 0.1, + "post_phoneme_length": 0.1, + "output_sampling_rate": 24000, + "output_stereo": false, + "pause_length_scale": "aaaaa", + })) + .map(|_| ()) + .unwrap_err(); + } } diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index b68d1ab2c..ebe06e270 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -1185,6 +1185,8 @@ mod inner { post_phoneme_length: 0.1, output_sampling_rate: DEFAULT_SAMPLING_RATE, output_stereo: false, + pause_length: (), + pause_length_scale: (), kana: Some(kana), } } diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java index 2f50c7235..afc735034 100644 --- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java +++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java @@ -55,6 +55,17 @@ public class AudioQuery { @Expose public boolean outputStereo; + /** 句読点などの無音時間。{@code null}のときは無視される。デフォルト値は{@code null}。 */ + @SerializedName("pause_length") + @Expose + @Nullable + public Double pauseLength; + + /** 読点などの無音時間(倍率)。デフォルト値は{@code 1.}。 */ + @SerializedName("pause_length_scale") + @Expose + public double pauseLengthScale; + /** * [読み取り専用] AquesTalk風記法。 * @@ -75,6 +86,8 @@ public AudioQuery() { this.prePhonemeLength = 0.1; this.postPhonemeLength = 0.1; this.outputSamplingRate = 24000; + this.pauseLength = null; + this.pauseLengthScale = 1.0; this.kana = null; } } diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_models.py b/crates/voicevox_core_python_api/python/voicevox_core/_models.py index 941ed84fc..9af47148a 100644 --- a/crates/voicevox_core_python_api/python/voicevox_core/_models.py +++ b/crates/voicevox_core_python_api/python/voicevox_core/_models.py @@ -208,6 +208,12 @@ class AudioQuery: output_stereo: bool """音声データをステレオ出力するか否か。""" + pause_length: None = None + """句読点などの無音時間。 ``None`` のときは無視される。デフォルト値は ``None`` 。""" + + pause_length_scale: float = 1.0 + """読点などの無音時間(倍率)。デフォルト値は ``1.0`` 。""" + kana: Optional[str] = None """ [読み取り専用] AquesTalk風記法。