diff --git a/app/src/main/java/com/grammatek/simaromur/AppRepository.java b/app/src/main/java/com/grammatek/simaromur/AppRepository.java index 9347b63c..c472f24d 100644 --- a/app/src/main/java/com/grammatek/simaromur/AppRepository.java +++ b/app/src/main/java/com/grammatek/simaromur/AppRepository.java @@ -145,7 +145,7 @@ public boolean isCurrentVoice(Voice voice) { */ public void downloadVoiceAsync(Voice voice, DownloadVoiceManager.Observer finishedObserver) { // when the download is successful, the voice is updated in the database. This happens - // asynchonously. + // asynchronously. mDVM.downloadVoiceAsync(voice, finishedObserver, mVoiceDao); } @@ -466,10 +466,10 @@ public void startNetworkTTS(Voice voice, CacheItem item, TTSRequest ttsRequest, Log.v(LOG_TAG, "startNetworkTTS: " + item.getUuid()); // map given voice to voiceId if (voice != null) { - final TTSObserver ttsObserver = new TTSObserver(pitch, speed, AudioManager.SAMPLE_RATE_WAV); + final TTSObserver ttsObserver = new TTSObserver(pitch, speed, mNetworkSpeakController.getNativeSampleRate()); if (playIfAudioCacheHit(voice.internalName, voice.version, item, ttsObserver, ttsRequest)) return; - final String SampleRate = "" + AudioManager.SAMPLE_RATE_WAV; + final String SampleRate = "" + mNetworkSpeakController.getNativeSampleRate(); final String normalized = item.getUtterance().getNormalized(); if (normalized.trim().isEmpty()) { Log.w(LOG_TAG, "startNetworkTTS: given text is whitespace only ?!"); @@ -529,8 +529,7 @@ public TTSEngineController.SpeakTask startDeviceSpeak(Voice voice, CacheItem ite e.printStackTrace(); return null; } - return mTTSEngineController.StartSpeak(item, speed, pitch, - mTTSEngineController.getEngine().GetNativeSampleRate(), observer, getCurrentTTsRequest()); + return mTTSEngineController.StartSpeak(item, speed, pitch, observer, getCurrentTTsRequest()); } /** @@ -587,6 +586,10 @@ public String getVersionOfVoice(String internalVoiceName) { return null; } + public int 
getVoiceNativeSampleRate() { + return mTTSEngineController.getEngine().GetNativeSampleRate(); + } + /** * Find if we have the specified language available. * Use our DB model to query availability of voices @@ -795,6 +798,7 @@ public void showTtsBackendWarningDialog(Context context) { */ public void speakAssetFile(SynthesisCallback callback, String assetFilename) { Log.v(LOG_TAG, "playAssetFile: " + assetFilename); + final int SAMPLE_RATE_ASSETS = 22050; try { InputStream inputStream = App.getContext().getAssets().open(assetFilename); int size = inputStream.available(); @@ -803,7 +807,7 @@ public void speakAssetFile(SynthesisCallback callback, String assetFilename) { Log.w(LOG_TAG, "playAssetFile: not enough bytes ?"); } // don't provide rawText: there are no speech marks to update - callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT, + callback.start(SAMPLE_RATE_ASSETS, AudioFormat.ENCODING_PCM_16BIT, AudioManager.N_CHANNELS); feedBytesToSynthesisCallback(callback, buffer, ""); callback.done(); @@ -850,7 +854,7 @@ public static void feedBytesToSynthesisCallback(SynthesisCallback callback, byte final int bytesConsumed = Math.min(maxBytes, bytesLeft); if (callback.hasStarted()) { // this feeds audio data to the callback, which will then be consumed by the TTS - // client. In case the current utterance is stopped(), all remaining audio data is + // client. In case the current utterance is stopped, all remaining audio data is // consumed and discarded and afterwards TTSService.onStopped() is executed. 
int cbStatus = callback.audioAvailable(buffer, offset, bytesConsumed); switch(cbStatus) { diff --git a/app/src/main/java/com/grammatek/simaromur/TTSService.java b/app/src/main/java/com/grammatek/simaromur/TTSService.java index 625a99f6..2f30d482 100644 --- a/app/src/main/java/com/grammatek/simaromur/TTSService.java +++ b/app/src/main/java/com/grammatek/simaromur/TTSService.java @@ -20,7 +20,6 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Optional; import java.util.Set; @@ -109,7 +108,7 @@ protected int onLoadLanguage(String language, String country, String variant) { * waiting for a TTSProcessingResult inside onSynthesizeText(). If afterwards the audio processing is * finished, the processing result is received and discarded, because the current utterance is * already finished and has changed. - * + *
* Note: mandatory, don't synchronize this method ! */ @Override @@ -156,7 +155,7 @@ protected void onSynthesizeText(SynthesisRequest request, loadedVoiceName = mRepository.getLoadedVoiceName(); } else { Log.w(LOG_TAG, "onSynthesizeText: couldn't load voice ("+voiceNameToLoad+")"); - callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT, + callback.start(mRepository.getVoiceNativeSampleRate(), AudioFormat.ENCODING_PCM_16BIT, AudioManager.N_CHANNELS); callback.error(TextToSpeech.ERROR_SERVICE); if (callback.hasStarted() && ! callback.hasFinished()) { @@ -215,12 +214,12 @@ protected void onSynthesizeText(SynthesisRequest request, Log.v(LOG_TAG, "onSynthesizeText: finished (" + item.getUuid() + ")"); return; } - startSynthesisCallback(callback, AudioManager.SAMPLE_RATE_WAV, true); + startSynthesisCallback(callback, mRepository.getVoiceNativeSampleRate(), true); setSpeechMarksToBeginning(callback); mRepository.startNetworkTTS(voice, item, ttsRequest, speechrate / 100.0f, pitch / 100.0f); break; case com.grammatek.simaromur.db.Voice.TYPE_ONNX: - startSynthesisCallback(callback, AudioManager.SAMPLE_RATE_ONNX, false); + startSynthesisCallback(callback, mRepository.getVoiceNativeSampleRate(), false); setSpeechMarksToBeginning(callback); mRepository.startDeviceTTS(voice, item, ttsRequest, speechrate / 100.0f, pitch / 100.0f); break; @@ -251,7 +250,7 @@ private void handleProcessingResult(SynthesisCallback callback, CacheItem item, // todo: we need to handle timeout errors here, e.g. processing // timeouts, some error, e.g. 
network timeouts are already taken care of TTSProcessingResult elem = mRepository.dequeueTTSProcessingResult(); - float rtf = estimateRTF(startTime, System.currentTimeMillis(), item, elem); + float rtf = estimateRTF(startTime, System.currentTimeMillis(), elem); Log.v(LOG_TAG, "estimateRTF: rtf=" + rtf); if (rtf > 500.0f && !isCached) { Log.w(LOG_TAG, "handleProcessingResult: rtf > 500.0f, something went wrong for the estimation"); @@ -320,11 +319,10 @@ private void handleProcessingResult(SynthesisCallback callback, CacheItem item, * * @param startTimeMillis time when the processing started * @param stopTimeMillis time when the processing stopped - * @param item cache item * @param elem processing result * @return the real time factor */ - private float estimateRTF(long startTimeMillis, long stopTimeMillis, CacheItem item, TTSProcessingResult elem) { + private float estimateRTF(long startTimeMillis, long stopTimeMillis, TTSProcessingResult elem) { String uuid = elem.getTTSRequest().getCacheItemUuid(); Log.v(LOG_TAG, "estimateRTF for: " + uuid); @@ -336,7 +334,7 @@ private float estimateRTF(long startTimeMillis, long stopTimeMillis, CacheItem i // assume currently slowest used sample rate, i.e. 16kHz and 16 bit with 1 channel // TODO: we should use the real sample rate here, but this needs to be passed via the // TTSProcessingResult - final int sampleRate = AudioManager.SAMPLE_RATE_WAV; + final int sampleRate = mRepository.getVoiceNativeSampleRate(); final int bytesPerSample = 2; final int channels = 1; @@ -445,7 +443,7 @@ private boolean testForAndHandleNetworkVoiceIssues(SynthesisCallback callback, /** * Signal TTS client a TTS error with given error code. - * + *
* The sequence for signalling an error seems to be important: callback.start(),
* callback.error(), callback.done(). Any callback.audioAvailable() call after a callback.error()
* is ignored.
@@ -455,7 +453,7 @@ private boolean testForAndHandleNetworkVoiceIssues(SynthesisCallback callback,
*/
private void signalTtsError(SynthesisCallback callback, int errorCode) {
Log.w(LOG_TAG, "signalTtsError(): errorCode = " + errorCode);
- callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT,
+ callback.start(mRepository.getVoiceNativeSampleRate(), AudioFormat.ENCODING_PCM_16BIT,
AudioManager.N_CHANNELS);
callback.error(errorCode);
callback.done();
@@ -467,12 +465,12 @@ private void signalTtsError(SynthesisCallback callback, int errorCode) {
*
* @param callback TTS callback provided in the onSynthesizeText() callback
*/
- private static void playSilence(SynthesisCallback callback) {
+ private void playSilence(SynthesisCallback callback) {
Log.v(LOG_TAG, "playSilence() ...");
- callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT,
- AudioManager.N_CHANNELS);
+ int sampleRate = mRepository.getVoiceNativeSampleRate();
+ callback.start(sampleRate, AudioFormat.ENCODING_PCM_16BIT, AudioManager.N_CHANNELS);
setSpeechMarksToBeginning(callback);
- byte[] silenceData = AudioManager.generatePcmSilence(0.25f);
+ byte[] silenceData = AudioManager.generatePcmSilence(0.25f, sampleRate);
callback.audioAvailable(silenceData, 0, silenceData.length);
if (! callback.hasFinished() && callback.hasStarted()) {
callback.done();
diff --git a/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java b/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java
index 2443a247..e06cd21c 100644
--- a/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java
+++ b/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java
@@ -16,9 +16,9 @@ public class AudioManager {
private final static String LOG_TAG = "Simaromur_" + AudioManager.class.getSimpleName();
// Some constants used throughout audio conversion
- public static final int SAMPLE_RATE_WAV = 16000;
+ //public static final int SAMPLE_RATE_WAV = 16000;
public static final int SAMPLE_RATE_MP3 = 22050;
- public static final int SAMPLE_RATE_ONNX = 16000;
+ //public static final int SAMPLE_RATE_ONNX = 16000;
//public static final int SAMPLE_RATE_ONNX = 22050;
public static final int N_CHANNELS = 1;
@@ -79,26 +79,6 @@ static public byte[] applyPitchAndSpeed(final byte[] monoPcmData, int sampleRate
return outputConversionStream.toByteArray();
}
- /**
- * Either apply pitch and speed to ttsData, resulting in a potentially differently sized output
- * buffer, or simply copy ttsData to the new output buffer, if no changes of speed or pitch
- * are requested.
- * Return the newly created output buffer.
- *
- * @param monoPcmData byte array of MONO PCM data to be used as input data. 22050 Hz sample rate
- * is expected
- * @param pitch pitch to be applied. 1.0f means no pitch change, values > 1.0 mean higher
- * pitch, values < 1.0 mean lower pitch than in given pcmData
- * @param speed speed to be applied. 1.0f means no speed change, values > 1.0 mean higher
- * speed, values < 1.0 mean lower speed than in given pcmData. This parameter
- * produces either more data for values >1.0, less data for values < 1.0, or
- * no data change for a value of 1.0
- * @return new byte array with converted PCM data
- */
- static public byte[] applyPitchAndSpeed(final byte[] monoPcmData, float pitch, float speed) {
- return applyPitchAndSpeed(monoPcmData, SAMPLE_RATE_WAV, pitch, speed);
- }
-
/**
* Converts given float values to 16bits PCM. No resampling or interpolation is done.
* Floats are rounded to the nearest integer.
@@ -284,10 +264,10 @@ static public byte[] pcmFloatTo16BitPCMWithDither(float[] pcmFloats, float norma
return outBuf;
}
- static public byte[] generatePcmSilence(float duration) {
+ static public byte[] generatePcmSilence(float duration, int sampleRate) {
final int nChannels = 1;
final int nBits = 16;
- final int nSamples = (int) (duration * SAMPLE_RATE_WAV);
+ final int nSamples = (int) (duration * sampleRate);
final int nBytes = nSamples * nChannels * nBits / 8;
return new byte[nBytes];
}
diff --git a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java
index 8cfadc12..f2a71cbd 100644
--- a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java
+++ b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java
@@ -1,8 +1,11 @@
package com.grammatek.simaromur.device;
import static com.grammatek.simaromur.cache.AudioFormat.AUDIO_FMT_PCM;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_11KHZ;
import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_16KHZ;
import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_22KHZ;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_44_1KHZ;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_48KHZ;
import android.media.AudioFormat;
import android.util.Log;
@@ -42,7 +45,7 @@ public class TTSEngineController {
TTSEngine mEngine;
final ExecutorService mExecutorService;
Future> mTaskFuture; // the currently enqueued task, might be executed by the executor service
- final TTSAudioControl mTTSAudioControl16khz;
+ TTSAudioControl mTTSAudioControl;
/**
* Constructor
@@ -56,8 +59,6 @@ public TTSEngineController(AssetVoiceManager avm, DownloadVoiceManager dvm) {
mAVM = avm;
mDVM = dvm;
mCurrentVoice = null;
- mTTSAudioControl16khz = new TTSAudioControl(AudioManager.SAMPLE_RATE_ONNX,
- AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT);
// we only need one thread per Audio setting
mExecutorService = Executors.newSingleThreadExecutor();
}
@@ -84,6 +85,8 @@ public void LoadEngine(Voice voice) throws IOException {
Log.v(LOG_TAG, "LoadEngine: " + devVoice.Type);
try {
mEngine = new TTSEngineOnnx(App.getContext().getAssets(), devVoice);
+ mTTSAudioControl = new TTSAudioControl(mEngine.GetNativeSampleRate(),
+ AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT);
mCurrentVoice = devVoice;
} catch (IllegalArgumentException e) {
Log.e(LOG_TAG, "LoadEngine: " + e.getMessage());
@@ -115,9 +118,9 @@ public void UnloadEngine() {
* Start to speak given text with given voice.
*/
synchronized
- public SpeakTask StartSpeak(CacheItem item, float speed, float pitch, int sampleRate,
+ public SpeakTask StartSpeak(CacheItem item, float speed, float pitch,
TTSAudioControl.AudioFinishedObserver observer, TTSRequest ttsRequest) {
- if (mEngine == null || mCurrentVoice == null) {
+ if (mEngine == null || mCurrentVoice == null || mTTSAudioControl == null) {
String errorMsg = "No TTS engine loaded !";
Log.e(LOG_TAG, errorMsg);
throw new RuntimeException(errorMsg);
@@ -128,7 +131,7 @@ public SpeakTask StartSpeak(CacheItem item, float speed, float pitch, int sample
mTaskFuture.cancel(true);
}
Log.v(LOG_TAG, "StartSpeak: scheduling new SpeakTask (1)");
- SpeakTask speakTask = new SpeakTask(item.getUuid(), speed, pitch, sampleRate, observer, mCurrentVoice, ttsRequest);
+ SpeakTask speakTask = new SpeakTask(item.getUuid(), speed, pitch, observer, mCurrentVoice, ttsRequest);
mTaskFuture = mExecutorService.submit(speakTask);
return speakTask;
}
@@ -159,7 +162,12 @@ public void StartSpeak(TTSObserver observer, TTSRequest ttsRequest) {
*/
synchronized
public void StopSpeak(TTSEngineController.SpeakTask speakTask) {
- mTTSAudioControl16khz.stop();
+ // mTTSAudioControl only exists once LoadEngine() has succeeded, so a stop request may
+ // find none. Stay a harmless no-op then, like the replaced mTTSAudioControl16khz.stop(),
+ // instead of throwing; a given speakTask is still stopped below.
+ if (mTTSAudioControl != null) {
+ mTTSAudioControl.stop();
+ }
if (speakTask != null) {
speakTask.stopSynthesis();
}
@@ -175,7 +183,6 @@ public class SpeakTask implements Runnable {
CacheItem item;
float speed;
float pitch;
- int sampleRate;
TTSObserver observer;
TTSAudioControl.AudioFinishedObserver audioObserver;
boolean isStopped = false;
@@ -189,10 +196,9 @@ public class SpeakTask implements Runnable {
* speed
* @param pitch pitch multiplier of voice, how many times higher/lower than normal voice
* pitch
- * @param sampleRate sample rate to use for the synthesis
* @param ttsRequest request to be used for the synthesis
*/
- public SpeakTask(String itemUuid, float speed, float pitch, int sampleRate,
+ public SpeakTask(String itemUuid, float speed, float pitch,
TTSAudioControl.AudioFinishedObserver audioObserver, DeviceVoice voice,
TTSRequest ttsRequest) {
this.ttsRequest = ttsRequest;
@@ -200,7 +206,6 @@ public SpeakTask(String itemUuid, float speed, float pitch, int sampleRate,
this.item = optItem.orElse(null);
this.speed = speed;
this.pitch = pitch;
- this.sampleRate = sampleRate;
this.audioObserver = audioObserver;
this.observer = null;
this.voice = voice;
@@ -222,7 +227,6 @@ public SpeakTask(TTSObserver observer, TTSRequest ttsRequest, DeviceVoice voice)
this.observer = observer;
this.speed = observer.getSpeed();
this.pitch = observer.getPitch();
- this.sampleRate = mEngine.GetNativeSampleRate();
this.voice = voice;
}
@@ -233,7 +237,11 @@ public SpeakTask(TTSObserver observer, TTSRequest ttsRequest, DeviceVoice voice)
*/
public void run() {
Log.v(LOG_SPEAK_TASK_TAG, "run() called");
- assert(sampleRate == mEngine.GetNativeSampleRate());
+ if (mEngine == null || mCurrentVoice == null || mTTSAudioControl == null) {
+ String errorMsg = "run(): No TTS engine loaded !";
+ Log.e(LOG_TAG, errorMsg);
+ throw new RuntimeException(errorMsg);
+ }
if (shouldStop()) {
Log.v(LOG_SPEAK_TASK_TAG, "run(): shouldStop(1): true");
@@ -278,10 +286,8 @@ public void run() {
if (observer == null) {
// TODO: also the media players should stop, if item has changed:
// - pass the cache item along
- byte[] processedAudio = AudioManager.applyPitchAndSpeed(audioData, sampleRate, pitch, speed);
- if (sampleRate == AudioManager.SAMPLE_RATE_ONNX) {
- mTTSAudioControl16khz.play(new TTSAudioControl.AudioEntry(processedAudio, audioObserver));
- }
+ byte[] processedAudio = AudioManager.applyPitchAndSpeed(audioData, mEngine.GetNativeSampleRate(), pitch, speed);
+ mTTSAudioControl.play(new TTSAudioControl.AudioEntry(processedAudio, audioObserver));
} else {
observer.update(audioData, ttsRequest);
}
@@ -314,13 +320,27 @@ private byte[] synthesizeSpeech(PhonemeEntry phonemeEntry) {
private boolean saveAudioToCacheEntry(PhonemeEntry phonemeEntry, byte[] bytes) {
SampleRate sampleRate;
- if (mEngine.GetNativeSampleRate() == 22050) {
- sampleRate = SAMPLE_RATE_22KHZ;
- } else if (mEngine.GetNativeSampleRate() == 16000) {
- sampleRate = SAMPLE_RATE_16KHZ;
- } else {
- throw new IllegalStateException("Unknown sample rate: " + mEngine.GetNativeSampleRate());
+ switch(mEngine.GetNativeSampleRate())
+ {
+ case 11025:
+ sampleRate = SAMPLE_RATE_11KHZ;
+ break;
+ case 16000:
+ sampleRate = SAMPLE_RATE_16KHZ;
+ break;
+ case 22050:
+ sampleRate = SAMPLE_RATE_22KHZ;
+ break;
+ case 44100:
+ sampleRate = SAMPLE_RATE_44_1KHZ;
+ break;
+ case 48000:
+ sampleRate = SAMPLE_RATE_48KHZ;
+ break;
+ default:
+ throw new IllegalStateException("Unknown sample rate: " + mEngine.GetNativeSampleRate());
}
+
final VoiceAudioDescription vad = UtteranceCacheManager.newAudioDescription(AUDIO_FMT_PCM,
sampleRate, bytes.length, mCurrentVoice.InternalName, mCurrentVoice.Version);
if (bytes.length == 0) {
diff --git a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java
index 5d27c8df..c803c5f9 100644
--- a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java
+++ b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java
@@ -35,13 +35,13 @@
public class TTSEngineOnnx implements TTSEngine {
private final static String LOG_TAG = "Simaromur_" + TTSEngineOnnx.class.getSimpleName();
- private final static int SAMPLE_RATE = 16000;
private final static float SENTENCE_PAUSE = 0.5f;
private static DeviceVoice sVoice = null;
// matches a position preceded by any of the characters '.!?;' not followed by zero or
// more whitespace characters ([\\s]*) and then a double quote (\").
- final static String SplitPunctuationSymbols = "(?<=[.!?;])(?![\\s]*\")";;
+ final static String SplitPunctuationSymbols = "(?<=[.!?;])(?![\\s]*\")";
+ final byte[] mPauseSilence;
private OrtEnvironment mOrtEnv;
private OrtSession mOrtSession;
private VitsConfig mModelConfig;
@@ -82,6 +82,13 @@ public TTSEngineOnnx(AssetManager asm, DeviceVoice voice) {
}
mPhoneConverter = new VitsPhoneConverter(mModelConfig.phonemeIdMap);
+ // fail at load time for any sample rate that saveAudioToCacheEntry() cannot map to a cache SampleRate
+ final int modelRate = mModelConfig.audio.sampleRate;
+ if (modelRate != 11025 && modelRate != 16000 && modelRate != 22050 && modelRate != 44100 && modelRate != 48000) {
+ throw new RuntimeException("Voice " + voice.Name + ": invalid sample rate " + modelRate + " Hz");
+ }
+ mPauseSilence = AudioManager.generatePcmSilence(SENTENCE_PAUSE, GetNativeSampleRate());
+
Log.v(LOG_TAG, "Onnx model loaded from assets/" + modelPath);
sVoice = voice;
}
@@ -146,7 +153,6 @@ public byte[] SpeakToPCM(String ipas) {
Instant startTime = Instant.now();
List