diff --git a/app/src/main/java/com/grammatek/simaromur/AppRepository.java b/app/src/main/java/com/grammatek/simaromur/AppRepository.java
index 9347b63c..c472f24d 100644
--- a/app/src/main/java/com/grammatek/simaromur/AppRepository.java
+++ b/app/src/main/java/com/grammatek/simaromur/AppRepository.java
@@ -145,7 +145,7 @@ public boolean isCurrentVoice(Voice voice) {
      */
     public void downloadVoiceAsync(Voice voice, DownloadVoiceManager.Observer finishedObserver) {
         // when the download is successful, the voice is updated in the database. This happens
-        // asynchonously.
+        // asynchronously.
         mDVM.downloadVoiceAsync(voice, finishedObserver, mVoiceDao);
     }

@@ -466,10 +466,10 @@ public void startNetworkTTS(Voice voice, CacheItem item, TTSRequest ttsRequest,
         Log.v(LOG_TAG, "startNetworkTTS: " + item.getUuid());
         // map given voice to voiceId
         if (voice != null) {
-            final TTSObserver ttsObserver = new TTSObserver(pitch, speed, AudioManager.SAMPLE_RATE_WAV);
+            final TTSObserver ttsObserver = new TTSObserver(pitch, speed, mNetworkSpeakController.getNativeSampleRate());
             if (playIfAudioCacheHit(voice.internalName, voice.version, item, ttsObserver, ttsRequest)) return;

-            final String SampleRate = "" + AudioManager.SAMPLE_RATE_WAV;
+            final String SampleRate = "" + mNetworkSpeakController.getNativeSampleRate();
             final String normalized = item.getUtterance().getNormalized();
             if (normalized.trim().isEmpty()) {
                 Log.w(LOG_TAG, "startNetworkTTS: given text is whitespace only ?!");
@@ -529,8 +529,7 @@ public TTSEngineController.SpeakTask startDeviceSpeak(Voice voice, CacheItem ite
             e.printStackTrace();
             return null;
         }
-        return mTTSEngineController.StartSpeak(item, speed, pitch,
-                mTTSEngineController.getEngine().GetNativeSampleRate(), observer, getCurrentTTsRequest());
+        return mTTSEngineController.StartSpeak(item, speed, pitch, observer, getCurrentTTsRequest());
     }

     /**
@@ -587,6 +586,10 @@ public String getVersionOfVoice(String internalVoiceName) {
         return null;
     }

+    public int getVoiceNativeSampleRate() {
+        return mTTSEngineController.getEngine().GetNativeSampleRate();
+    }
+
     /**
      * Find if we have the specified language available.
      * Use our DB model to query availability of voices
@@ -795,6 +798,7 @@ public void showTtsBackendWarningDialog(Context context) {
      */
     public void speakAssetFile(SynthesisCallback callback, String assetFilename) {
         Log.v(LOG_TAG, "playAssetFile: " + assetFilename);
+        final int SAMPLE_RATE_ASSETS = 22050;
         try {
             InputStream inputStream = App.getContext().getAssets().open(assetFilename);
             int size = inputStream.available();
@@ -803,7 +807,7 @@ public void speakAssetFile(SynthesisCallback callback, String assetFilename) {
                 Log.w(LOG_TAG, "playAssetFile: not enough bytes ?");
             }
             // don't provide rawText: there are no speech marks to update
-            callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT,
+            callback.start(SAMPLE_RATE_ASSETS, AudioFormat.ENCODING_PCM_16BIT,
                     AudioManager.N_CHANNELS);
             feedBytesToSynthesisCallback(callback, buffer, "");
             callback.done();
@@ -850,7 +854,7 @@ public static void feedBytesToSynthesisCallback(SynthesisCallback callback, byte
             final int bytesConsumed = Math.min(maxBytes, bytesLeft);
             if (callback.hasStarted()) {
                 // this feeds audio data to the callback, which will then be consumed by the TTS
-                // client. In case the current utterance is stopped(), all remaining audio data is
+                // client. In case the current utterance is stopped, all remaining audio data is
                 // consumed and discarded and afterwards TTSService.onStopped() is executed.
                 int cbStatus = callback.audioAvailable(buffer, offset, bytesConsumed);
                 switch(cbStatus) {
diff --git a/app/src/main/java/com/grammatek/simaromur/TTSService.java b/app/src/main/java/com/grammatek/simaromur/TTSService.java
index 625a99f6..2f30d482 100644
--- a/app/src/main/java/com/grammatek/simaromur/TTSService.java
+++ b/app/src/main/java/com/grammatek/simaromur/TTSService.java
@@ -20,7 +20,6 @@
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Locale;
 import java.util.Optional;
 import java.util.Set;

@@ -109,7 +108,7 @@ protected int onLoadLanguage(String language, String country, String variant) {
      * waiting for a TTSProcessingResult inside onSynthesizeText(). If afterwards the audio processing is
      * finished, the processing result is received and discarded, because the current utterance is
      * already finished and has changed.
-     *
+
      * Note: mandatory, don't synchronize this method !
      */
     @Override
@@ -156,7 +155,7 @@ protected void onSynthesizeText(SynthesisRequest request,
             loadedVoiceName = mRepository.getLoadedVoiceName();
         } else {
             Log.w(LOG_TAG, "onSynthesizeText: couldn't load voice ("+voiceNameToLoad+")");
-            callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT,
+            callback.start(mRepository.getVoiceNativeSampleRate(), AudioFormat.ENCODING_PCM_16BIT,
                     AudioManager.N_CHANNELS);
             callback.error(TextToSpeech.ERROR_SERVICE);
             if (callback.hasStarted() && ! callback.hasFinished()) {
@@ -215,12 +214,12 @@ protected void onSynthesizeText(SynthesisRequest request,
                     Log.v(LOG_TAG, "onSynthesizeText: finished (" + item.getUuid() + ")");
                     return;
                 }
-                startSynthesisCallback(callback, AudioManager.SAMPLE_RATE_WAV, true);
+                startSynthesisCallback(callback, mRepository.getVoiceNativeSampleRate(), true);
                 setSpeechMarksToBeginning(callback);
                 mRepository.startNetworkTTS(voice, item, ttsRequest, speechrate / 100.0f, pitch / 100.0f);
                 break;
             case com.grammatek.simaromur.db.Voice.TYPE_ONNX:
-                startSynthesisCallback(callback, AudioManager.SAMPLE_RATE_ONNX, false);
+                startSynthesisCallback(callback, mRepository.getVoiceNativeSampleRate(), false);
                 setSpeechMarksToBeginning(callback);
                 mRepository.startDeviceTTS(voice, item, ttsRequest, speechrate / 100.0f, pitch / 100.0f);
                 break;
@@ -251,7 +250,7 @@ private void handleProcessingResult(SynthesisCallback callback, CacheItem item,
         // todo: we need to handle timeout errors here, e.g. processing
         //       timeouts, some error, e.g. network timeouts are already taken care of
         TTSProcessingResult elem = mRepository.dequeueTTSProcessingResult();
-        float rtf = estimateRTF(startTime, System.currentTimeMillis(), item, elem);
+        float rtf = estimateRTF(startTime, System.currentTimeMillis(), elem);
         Log.v(LOG_TAG, "estimateRTF: rtf=" + rtf);
         if (rtf > 500.0f && !isCached) {
             Log.w(LOG_TAG, "handleProcessingResult: rtf > 500.0f, something went wrong for the estimation");
@@ -320,11 +319,10 @@ private void handleProcessingResult(SynthesisCallback callback, CacheItem item,
      *
      * @param startTimeMillis time when the processing started
      * @param stopTimeMillis time when the processing stopped
-     * @param item cache item
      * @param elem processing result
      * @return the real time factor
      */
-    private float estimateRTF(long startTimeMillis, long stopTimeMillis, CacheItem item, TTSProcessingResult elem) {
+    private float estimateRTF(long startTimeMillis, long stopTimeMillis, TTSProcessingResult elem) {
         String uuid = elem.getTTSRequest().getCacheItemUuid();
         Log.v(LOG_TAG, "estimateRTF for: " + uuid);

@@ -336,7 +334,7 @@ private float estimateRTF(long startTimeMillis, long stopTimeMillis, CacheItem i
         // assume currently slowest used sample rate, i.e. 16kHz and 16 bit with 1 channel
         // TODO: we should use the real sample rate here, but this needs to be passed via the
         //       TTSProcessingResult
-        final int sampleRate = AudioManager.SAMPLE_RATE_WAV;
+        final int sampleRate = mRepository.getVoiceNativeSampleRate();
         final int bytesPerSample = 2;
         final int channels = 1;

@@ -445,7 +443,7 @@ private boolean testForAndHandleNetworkVoiceIssues(SynthesisCallback callback,

     /**
      * Signal TTS client a TTS error with given error code.
-     *
+
      * The sequence for signalling an error seems to be important: callback.start(),
      * callback.error(), callback.done(). Any callback.audioAvailable() call after a callback.error()
      * is ignored.
@@ -455,7 +453,7 @@ private boolean testForAndHandleNetworkVoiceIssues(SynthesisCallback callback,
      */
     private void signalTtsError(SynthesisCallback callback, int errorCode) {
         Log.w(LOG_TAG, "signalTtsError(): errorCode = " + errorCode);
-        callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT,
+        callback.start(mRepository.getVoiceNativeSampleRate(), AudioFormat.ENCODING_PCM_16BIT,
                 AudioManager.N_CHANNELS);
         callback.error(errorCode);
         callback.done();
@@ -467,12 +465,12 @@ private boolean testForAndHandleNetworkVoiceIssues(SynthesisCallback callback,
      *
      * @param callback TTS callback provided in the onSynthesizeText() callback
      */
-    private static void playSilence(SynthesisCallback callback) {
+    private void playSilence(SynthesisCallback callback) {
         Log.v(LOG_TAG, "playSilence() ...");
-        callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT,
-                AudioManager.N_CHANNELS);
+        int sampleRate = mRepository.getVoiceNativeSampleRate();
+        callback.start(sampleRate, AudioFormat.ENCODING_PCM_16BIT, AudioManager.N_CHANNELS);
         setSpeechMarksToBeginning(callback);
-        byte[] silenceData = AudioManager.generatePcmSilence(0.25f);
+        byte[] silenceData = AudioManager.generatePcmSilence(0.25f, sampleRate);
         callback.audioAvailable(silenceData, 0, silenceData.length);
         if (! callback.hasFinished() && callback.hasStarted()) {
             callback.done();
diff --git a/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java b/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java
index 2443a247..e06cd21c 100644
--- a/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java
+++ b/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java
@@ -16,9 +16,9 @@ public class AudioManager {
     private final static String LOG_TAG = "Simaromur_" + AudioManager.class.getSimpleName();

     // Some constants used throughout audio conversion
-    public static final int SAMPLE_RATE_WAV = 16000;
+    //public static final int SAMPLE_RATE_WAV = 16000;
     public static final int SAMPLE_RATE_MP3 = 22050;
-    public static final int SAMPLE_RATE_ONNX = 16000;
+    //public static final int SAMPLE_RATE_ONNX = 16000;
     //public static final int SAMPLE_RATE_ONNX = 22050;
     public static final int N_CHANNELS = 1;

@@ -79,26 +79,6 @@ static public byte[] applyPitchAndSpeed(final byte[] monoPcmData, int sampleRate
         return outputConversionStream.toByteArray();
     }

-    /**
-     * Either apply pitch and speed to ttsData, resulting in a potentially differently sized output
-     * buffer, or simply copy ttsData to the new output buffer, if no changes of speed or pitch
-     * are requested.
-     * Return the newly created output buffer.
-     *
-     * @param monoPcmData byte array of MONO PCM data to be used as input data. 22050 Hz sample rate
-     *                    is expected
-     * @param pitch pitch to be applied. 1.0f means no pitch change, values > 1.0 mean higher
-     *              pitch, values < 1.0 mean lower pitch than in given pcmData
-     * @param speed speed to be applied. 1.0f means no speed change, values > 1.0 mean higher
-     *              speed, values < 1.0 mean lower speed than in given pcmData. This parameter
-     *              produces either more data for values >1.0, less data for values < 1.0, or
-     *              no data change for a value of 1.0
-     * @return new byte array with converted PCM data
-     */
-    static public byte[] applyPitchAndSpeed(final byte[] monoPcmData, float pitch, float speed) {
-        return applyPitchAndSpeed(monoPcmData, SAMPLE_RATE_WAV, pitch, speed);
-    }
-
     /**
      * Converts given float values to 16bits PCM. No resampling or interpolation is done.
      * Floats are rounded to the nearest integer.
@@ -284,10 +264,10 @@ static public byte[] pcmFloatTo16BitPCMWithDither(float[] pcmFloats, float norma
         return outBuf;
     }

-    static public byte[] generatePcmSilence(float duration) {
+    static public byte[] generatePcmSilence(float duration, int sampleRate) {
         final int nChannels = 1;
         final int nBits = 16;
-        final int nSamples = (int) (duration * SAMPLE_RATE_WAV);
+        final int nSamples = (int) (duration * sampleRate);
         final int nBytes = nSamples * nChannels * nBits / 8;
         return new byte[nBytes];
     }
diff --git a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java
index 8cfadc12..f2a71cbd 100644
--- a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java
+++ b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java
@@ -1,8 +1,11 @@
 package com.grammatek.simaromur.device;

 import static com.grammatek.simaromur.cache.AudioFormat.AUDIO_FMT_PCM;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_11KHZ;
 import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_16KHZ;
 import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_22KHZ;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_44_1KHZ;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_48KHZ;

 import android.media.AudioFormat;
 import android.util.Log;
@@ -42,7 +45,7 @@ public class TTSEngineController {
     TTSEngine mEngine;
     final ExecutorService mExecutorService;
     Future mTaskFuture; // the currently enqueued task, might be executed by the executor service
-    final TTSAudioControl mTTSAudioControl16khz;
+    TTSAudioControl mTTSAudioControl;

     /**
      * Constructor
@@ -56,8 +59,6 @@ public TTSEngineController(AssetVoiceManager avm, DownloadVoiceManager dvm) {
         mAVM = avm;
         mDVM = dvm;
         mCurrentVoice = null;
-        mTTSAudioControl16khz = new TTSAudioControl(AudioManager.SAMPLE_RATE_ONNX,
-                AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT);
         // we only need one thread per Audio setting
         mExecutorService = Executors.newSingleThreadExecutor();
     }
@@ -84,6 +85,8 @@ public void LoadEngine(Voice voice) throws IOException {
                 Log.v(LOG_TAG, "LoadEngine: " + devVoice.Type);
                 try {
                     mEngine = new TTSEngineOnnx(App.getContext().getAssets(), devVoice);
+                    mTTSAudioControl = new TTSAudioControl(mEngine.GetNativeSampleRate(),
+                            AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT);
                     mCurrentVoice = devVoice;
                 } catch (IllegalArgumentException e) {
                     Log.e(LOG_TAG, "LoadEngine: " + e.getMessage());
@@ -115,9 +118,9 @@ public void UnloadEngine() {
      * Start to speak given text with given voice.
      */
     synchronized
-    public SpeakTask StartSpeak(CacheItem item, float speed, float pitch, int sampleRate,
+    public SpeakTask StartSpeak(CacheItem item, float speed, float pitch,
                                 TTSAudioControl.AudioFinishedObserver observer, TTSRequest ttsRequest) {
-        if (mEngine == null || mCurrentVoice == null) {
+        if (mEngine == null || mCurrentVoice == null || mTTSAudioControl == null) {
             String errorMsg = "No TTS engine loaded !";
             Log.e(LOG_TAG, errorMsg);
             throw new RuntimeException(errorMsg);
@@ -128,7 +131,7 @@ public SpeakTask StartSpeak(CacheItem item, float speed, float pitch, int sample
             mTaskFuture.cancel(true);
         }
         Log.v(LOG_TAG, "StartSpeak: scheduling new SpeakTask (1)");
-        SpeakTask speakTask = new SpeakTask(item.getUuid(), speed, pitch, sampleRate, observer, mCurrentVoice, ttsRequest);
+        SpeakTask speakTask = new SpeakTask(item.getUuid(), speed, pitch, observer, mCurrentVoice, ttsRequest);
         mTaskFuture = mExecutorService.submit(speakTask);
         return speakTask;
     }
@@ -159,7 +162,12 @@ public void StartSpeak(TTSObserver observer, TTSRequest ttsRequest) {
      */
     synchronized
     public void StopSpeak(TTSEngineController.SpeakTask speakTask) {
-        mTTSAudioControl16khz.stop();
+        if (mEngine == null || mCurrentVoice == null || mTTSAudioControl == null) {
+            String errorMsg = "StopSpeak(): No TTS engine loaded !";
+            Log.e(LOG_TAG, errorMsg);
+            throw new RuntimeException(errorMsg);
+        }
+        mTTSAudioControl.stop();
         if (speakTask != null) {
             speakTask.stopSynthesis();
         }
@@ -175,7 +183,6 @@ public class SpeakTask implements Runnable {
         CacheItem item;
         float speed;
         float pitch;
-        int sampleRate;
         TTSObserver observer;
         TTSAudioControl.AudioFinishedObserver audioObserver;
         boolean isStopped = false;
@@ -189,10 +196,9 @@ public class SpeakTask implements Runnable {
         *              speed
         * @param pitch pitch multiplier of voice, how many times higher/lower than normal voice
         *              pitch
-        * @param sampleRate sample rate to use for the synthesis
         * @param ttsRequest request to be used for the synthesis
         */
-        public SpeakTask(String itemUuid, float speed, float pitch, int sampleRate,
+        public SpeakTask(String itemUuid, float speed, float pitch,
                          TTSAudioControl.AudioFinishedObserver audioObserver, DeviceVoice voice,
                          TTSRequest ttsRequest) {
            this.ttsRequest = ttsRequest;
@@ -200,7 +206,6 @@ public SpeakTask(String itemUuid, float speed, float pitch, int sampleRate,
            this.item = optItem.orElse(null);
            this.speed = speed;
            this.pitch = pitch;
-           this.sampleRate = sampleRate;
            this.audioObserver = audioObserver;
            this.observer = null;
            this.voice = voice;
@@ -222,7 +227,6 @@ public SpeakTask(TTSObserver observer, TTSRequest ttsRequest, DeviceVoice voice)
            this.observer = observer;
            this.speed = observer.getSpeed();
            this.pitch = observer.getPitch();
-           this.sampleRate = mEngine.GetNativeSampleRate();
            this.voice = voice;
        }

@@ -233,7 +237,11 @@ public SpeakTask(TTSObserver observer, TTSRequest ttsRequest, DeviceVoice voice)
         */
        public void run() {
            Log.v(LOG_SPEAK_TASK_TAG, "run() called");
-           assert(sampleRate == mEngine.GetNativeSampleRate());
+           if (mEngine == null || mCurrentVoice == null || mTTSAudioControl == null) {
+               String errorMsg = "run(): No TTS engine loaded !";
+               Log.e(LOG_TAG, errorMsg);
+               throw new RuntimeException(errorMsg);
+           }

            if (shouldStop()) {
                Log.v(LOG_SPEAK_TASK_TAG, "run(): shouldStop(1): true");
@@ -278,10 +286,8 @@ public void run() {
                if (observer == null) {
                    // TODO: also the media players should stop, if item has changed:
                    //       - pass the cache item along
-                   byte[] processedAudio = AudioManager.applyPitchAndSpeed(audioData, sampleRate, pitch, speed);
-                   if (sampleRate == AudioManager.SAMPLE_RATE_ONNX) {
-                       mTTSAudioControl16khz.play(new TTSAudioControl.AudioEntry(processedAudio, audioObserver));
-                   }
+                   byte[] processedAudio = AudioManager.applyPitchAndSpeed(audioData, mEngine.GetNativeSampleRate(), pitch, speed);
+                   mTTSAudioControl.play(new TTSAudioControl.AudioEntry(processedAudio, audioObserver));
                } else {
                    observer.update(audioData, ttsRequest);
                }
@@ -314,13 +320,27 @@ private byte[] synthesizeSpeech(PhonemeEntry phonemeEntry) {

        private boolean saveAudioToCacheEntry(PhonemeEntry phonemeEntry, byte[] bytes) {
            SampleRate sampleRate;
-           if (mEngine.GetNativeSampleRate() == 22050) {
-               sampleRate = SAMPLE_RATE_22KHZ;
-           } else if (mEngine.GetNativeSampleRate() == 16000) {
-               sampleRate = SAMPLE_RATE_16KHZ;
-           } else {
-               throw new IllegalStateException("Unknown sample rate: " + mEngine.GetNativeSampleRate());
+           switch(mEngine.GetNativeSampleRate())
+           {
+               case 11025:
+                   sampleRate = SAMPLE_RATE_11KHZ;
+                   break;
+               case 16000:
+                   sampleRate = SAMPLE_RATE_16KHZ;
+                   break;
+               case 22050:
+                   sampleRate = SAMPLE_RATE_22KHZ;
+                   break;
+               case 44100:
+                   sampleRate = SAMPLE_RATE_44_1KHZ;
+                   break;
+               case 48000:
+                   sampleRate = SAMPLE_RATE_48KHZ;
+                   break;
+               default:
+                   throw new IllegalStateException("Unknown sample rate: " + mEngine.GetNativeSampleRate());
            }
+
            final VoiceAudioDescription vad = UtteranceCacheManager.newAudioDescription(AUDIO_FMT_PCM,
                    sampleRate, bytes.length, mCurrentVoice.InternalName, mCurrentVoice.Version);
            if (bytes.length == 0) {
diff --git a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java
index 5d27c8df..c803c5f9 100644
--- a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java
+++ b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java
@@ -35,13 +35,13 @@ public class TTSEngineOnnx implements TTSEngine {

     private final static String LOG_TAG = "Simaromur_" + TTSEngineOnnx.class.getSimpleName();

-    private final static int SAMPLE_RATE = 16000;
     private final static float SENTENCE_PAUSE = 0.5f;
     private static DeviceVoice sVoice = null;

     // matches a position preceded by any of the characters '.!?;' not followed by zero or
     // more whitespace characters ([\\s]*) and then a double quote (\").
-    final static String SplitPunctuationSymbols = "(?<=[.!?;])(?![\\s]*\")";;
+    final static String SplitPunctuationSymbols = "(?<=[.!?;])(?![\\s]*\")";
+    final byte[] mPauseSilence;
     private OrtEnvironment mOrtEnv;
     private OrtSession mOrtSession;
     private VitsConfig mModelConfig;
@@ -82,6 +82,13 @@ public TTSEngineOnnx(AssetManager asm, DeviceVoice voice) {
         }
         mPhoneConverter = new VitsPhoneConverter(mModelConfig.phonemeIdMap);

+        // check if sample rate of model is in a valid range between 11kHz and 48kHz
+        if (mModelConfig.audio.sampleRate < 11025 || mModelConfig.audio.sampleRate > 48000) {
+            throw new RuntimeException("Voice " + voice.Name + ": invalid sample rate "
+                    + mModelConfig.audio.sampleRate + " Hz");
+        }
+        mPauseSilence = AudioManager.generatePcmSilence(SENTENCE_PAUSE, GetNativeSampleRate());
+
         Log.v(LOG_TAG, "Onnx model loaded from assets/" + modelPath);
         sVoice = voice;
     }
@@ -146,7 +153,6 @@ public byte[] SpeakToPCM(String ipas) {

         Instant startTime = Instant.now();
         List pcmList = new ArrayList<>();
-        byte[] silence = AudioManager.generatePcmSilence(SENTENCE_PAUSE);
         List sentences = new ArrayList<>();

         // split ipa's to sentences by splitting at punctuation; we also need to
@@ -159,7 +165,7 @@ public byte[] SpeakToPCM(String ipas) {
             pcmList.add(pcmSentence);
             generatedPcmLength += pcmSentence.length;
             // add silence after each sentence, as the voice doesn't have any pauses
-            pcmList.add(silence);
+            pcmList.add(mPauseSilence);
         }
         // remove the last silence again
         pcmList.remove(pcmList.size()-1);
@@ -231,7 +237,7 @@ private byte[] speakSentenceToPCM(String ipas) {

     @Override
     public int GetNativeSampleRate() {
-        return SAMPLE_RATE;
+        return mModelConfig.audio.sampleRate;
     }

     public static class VitsPhoneConverter {
diff --git a/app/src/main/java/com/grammatek/simaromur/network/api/SpeakController.java b/app/src/main/java/com/grammatek/simaromur/network/api/SpeakController.java
index 01ced6e6..30df0d49 100644
--- a/app/src/main/java/com/grammatek/simaromur/network/api/SpeakController.java
+++ b/app/src/main/java/com/grammatek/simaromur/network/api/SpeakController.java
@@ -8,15 +8,14 @@
 import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_16KHZ;
 import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_22KHZ;
 import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_44_1KHZ;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_48KHZ;

 import android.util.Log;
-
 import androidx.annotation.NonNull;

 import com.grammatek.simaromur.App;
 import com.grammatek.simaromur.AppRepository;
 import com.grammatek.simaromur.TTSRequest;
-import com.grammatek.simaromur.TTSService;
 import com.grammatek.simaromur.audio.AudioObserver;
 import com.grammatek.simaromur.cache.AudioFormat;
 import com.grammatek.simaromur.cache.CacheItem;
@@ -25,7 +24,6 @@
 import com.grammatek.simaromur.cache.Utterance;
 import com.grammatek.simaromur.cache.UtteranceCacheManager;
 import com.grammatek.simaromur.cache.VoiceAudioDescription;
-import com.grammatek.simaromur.db.VoiceDao;
 import com.grammatek.simaromur.network.api.pojo.SpeakRequest;

 import org.jetbrains.annotations.NotNull;
@@ -103,17 +101,45 @@ public byte[] speak(SpeakRequest request) throws IOException {

         byte[] voiceAudio = null;
         if (response.isSuccessful()) {
-            ResponseBody body = response.body();
-            assert body != null;
-            voiceAudio = body.bytes();
-            Log.v(LOG_TAG, "API returned data of size: " + voiceAudio.length);
+            try (ResponseBody body = response.body()) {
+                if (body != null) {
+                    voiceAudio = body.bytes();
+                    Log.v(LOG_TAG, "API returned data of size: " + voiceAudio.length);
+                } else {
+                    Log.e(LOG_TAG, "API Error: no audio data returned");
+                }
+            } catch (IOException e) {
+                Log.e(LOG_TAG, "Exception: " + e.getMessage());
+                e.printStackTrace();
+            }
         } else {
-            Log.e(LOG_TAG, "API Error: " + response.errorBody());
+            String errMsg;
+            try (ResponseBody errorBody = response.errorBody()) {
+                if (errorBody != null) {
+                    errMsg = errorBody.string();
+                    Log.e(LOG_TAG, "speak(): API Error: " + errMsg);
+                } else {
+                    Log.e(LOG_TAG, "speak(): API Error: unknown error reason");
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+                errMsg = e.getMessage();
+                Log.e(LOG_TAG, "speak(): Error occurred: " + errMsg);
+            }
         }

         return voiceAudio;
     }

+    /**
+     * Returns the native sample rate of the voice model used for the network speak request.
+     *
+     * @return native sample rate of the voice model
+     */
+    public int getNativeSampleRate() {
+        return 22050;
+    }
+
     /**
      * Builds a Retrofit caller object for the Network API without calling its endpoint yet.
      *
@@ -150,17 +176,20 @@ public synchronized void onResponse(@NotNull Call call, Response call, Response call, Response