diff --git a/app/src/main/java/com/grammatek/simaromur/AppRepository.java b/app/src/main/java/com/grammatek/simaromur/AppRepository.java
index 9347b63c..c472f24d 100644
--- a/app/src/main/java/com/grammatek/simaromur/AppRepository.java
+++ b/app/src/main/java/com/grammatek/simaromur/AppRepository.java
@@ -145,7 +145,7 @@ public boolean isCurrentVoice(Voice voice) {
      */
     public void downloadVoiceAsync(Voice voice, DownloadVoiceManager.Observer finishedObserver) {
         // when the download is successful, the voice is updated in the database. This happens
-        // asynchonously.
+        // asynchronously.
         mDVM.downloadVoiceAsync(voice, finishedObserver, mVoiceDao);
     }
 
@@ -466,10 +466,10 @@ public void startNetworkTTS(Voice voice, CacheItem item, TTSRequest ttsRequest,
         Log.v(LOG_TAG, "startNetworkTTS: " + item.getUuid());
         // map given voice to voiceId
         if (voice != null) {
-            final TTSObserver ttsObserver = new TTSObserver(pitch, speed, AudioManager.SAMPLE_RATE_WAV);
+            final TTSObserver ttsObserver = new TTSObserver(pitch, speed, mNetworkSpeakController.getNativeSampleRate());
             if (playIfAudioCacheHit(voice.internalName, voice.version, item, ttsObserver, ttsRequest)) return;
 
-            final String SampleRate = "" + AudioManager.SAMPLE_RATE_WAV;
+            final String SampleRate = "" + mNetworkSpeakController.getNativeSampleRate();
             final String normalized = item.getUtterance().getNormalized();
             if (normalized.trim().isEmpty()) {
                 Log.w(LOG_TAG, "startNetworkTTS: given text is whitespace only ?!");
@@ -529,8 +529,7 @@ public TTSEngineController.SpeakTask startDeviceSpeak(Voice voice, CacheItem ite
             e.printStackTrace();
             return null;
         }
-        return mTTSEngineController.StartSpeak(item, speed, pitch,
-                mTTSEngineController.getEngine().GetNativeSampleRate(), observer, getCurrentTTsRequest());
+        return mTTSEngineController.StartSpeak(item, speed, pitch, observer, getCurrentTTsRequest());
     }
 
     /**
@@ -587,6 +586,23 @@ public String getVersionOfVoice(String internalVoiceName) {
         return null;
     }
 
+    /**
+     * Returns the sample rate to announce to the TTS client for the current voice.
+     * <p>
+     * Uses the native sample rate of the loaded on-device engine. If no engine is loaded
+     * (e.g. loading the voice failed or only a network voice is used), falls back to the
+     * sample rate of the network voices instead of dereferencing a null engine.
+     *
+     * @return sample rate in Hz
+     */
+    public int getVoiceNativeSampleRate() {
+        // NOTE(review): assumes getEngine() returns null while no engine is loaded - confirm
+        if (mTTSEngineController.getEngine() == null) {
+            return mNetworkSpeakController.getNativeSampleRate();
+        }
+        return mTTSEngineController.getEngine().GetNativeSampleRate();
+    }
+
     /**
      * Find if we have the specified language available.
      * Use our DB model to query availability of voices
@@ -795,6 +798,7 @@ public void showTtsBackendWarningDialog(Context context) {
      */
     public void speakAssetFile(SynthesisCallback callback, String assetFilename) {
         Log.v(LOG_TAG, "playAssetFile: " + assetFilename);
+        final int SAMPLE_RATE_ASSETS = 22050;
         try {
             InputStream inputStream = App.getContext().getAssets().open(assetFilename);
             int size = inputStream.available();
@@ -803,7 +807,7 @@ public void speakAssetFile(SynthesisCallback callback, String assetFilename) {
                 Log.w(LOG_TAG, "playAssetFile: not enough bytes ?");
             }
             // don't provide rawText: there are no speech marks to update
-            callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT,
+            callback.start(SAMPLE_RATE_ASSETS, AudioFormat.ENCODING_PCM_16BIT,
                     AudioManager.N_CHANNELS);
             feedBytesToSynthesisCallback(callback, buffer, "");
             callback.done();
@@ -850,7 +854,7 @@ public static void feedBytesToSynthesisCallback(SynthesisCallback callback, byte
             final int bytesConsumed = Math.min(maxBytes, bytesLeft);
             if (callback.hasStarted()) {
                 // this feeds audio data to the callback, which will then be consumed by the TTS
-                // client. In case the current utterance is stopped(), all remaining audio data is
+                // client. In case the current utterance is stopped, all remaining audio data is
                 // consumed and discarded and afterwards TTSService.onStopped() is executed.
                 int cbStatus = callback.audioAvailable(buffer, offset, bytesConsumed);
                 switch(cbStatus) {
diff --git a/app/src/main/java/com/grammatek/simaromur/TTSService.java b/app/src/main/java/com/grammatek/simaromur/TTSService.java
index 625a99f6..2f30d482 100644
--- a/app/src/main/java/com/grammatek/simaromur/TTSService.java
+++ b/app/src/main/java/com/grammatek/simaromur/TTSService.java
@@ -20,7 +20,6 @@
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Locale;
 import java.util.Optional;
 import java.util.Set;
 
@@ -109,7 +108,7 @@ protected int onLoadLanguage(String language, String country, String variant) {
      * waiting for a TTSProcessingResult inside onSynthesizeText(). If afterwards the audio processing is
      * finished, the processing result is received and discarded, because the current utterance is
      * already finished and has changed.
-     *
+     * <p>
      * Note:  mandatory, don't synchronize this method !
      */
     @Override
@@ -156,7 +155,7 @@ protected void onSynthesizeText(SynthesisRequest request,
                 loadedVoiceName = mRepository.getLoadedVoiceName();
             } else {
                 Log.w(LOG_TAG, "onSynthesizeText: couldn't load voice ("+voiceNameToLoad+")");
-                callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT,
+                callback.start(mRepository.getVoiceNativeSampleRate(), AudioFormat.ENCODING_PCM_16BIT,
                         AudioManager.N_CHANNELS);
                 callback.error(TextToSpeech.ERROR_SERVICE);
                 if (callback.hasStarted() && ! callback.hasFinished()) {
@@ -215,12 +214,12 @@ protected void onSynthesizeText(SynthesisRequest request,
                     Log.v(LOG_TAG, "onSynthesizeText: finished (" + item.getUuid() + ")");
                     return;
                 }
-                startSynthesisCallback(callback, AudioManager.SAMPLE_RATE_WAV, true);
+                startSynthesisCallback(callback, mRepository.getVoiceNativeSampleRate(), true);
                 setSpeechMarksToBeginning(callback);
                 mRepository.startNetworkTTS(voice, item, ttsRequest, speechrate / 100.0f, pitch / 100.0f);
                 break;
             case com.grammatek.simaromur.db.Voice.TYPE_ONNX:
-                startSynthesisCallback(callback, AudioManager.SAMPLE_RATE_ONNX, false);
+                startSynthesisCallback(callback, mRepository.getVoiceNativeSampleRate(), false);
                 setSpeechMarksToBeginning(callback);
                 mRepository.startDeviceTTS(voice, item, ttsRequest, speechrate / 100.0f, pitch / 100.0f);
                 break;
@@ -251,7 +250,7 @@ private void handleProcessingResult(SynthesisCallback callback, CacheItem item,
                 // todo: we need to handle timeout errors here, e.g. processing
                 //       timeouts, some error, e.g. network timeouts are already taken care of
                 TTSProcessingResult elem = mRepository.dequeueTTSProcessingResult();
-                float rtf = estimateRTF(startTime, System.currentTimeMillis(), item, elem);
+                float rtf = estimateRTF(startTime, System.currentTimeMillis(), elem);
                 Log.v(LOG_TAG, "estimateRTF: rtf=" + rtf);
                 if (rtf > 500.0f && !isCached) {
                     Log.w(LOG_TAG, "handleProcessingResult: rtf > 500.0f, something went wrong for the estimation");
@@ -320,11 +319,10 @@ private void handleProcessingResult(SynthesisCallback callback, CacheItem item,
      *
      * @param startTimeMillis   time when the processing started
      * @param stopTimeMillis    time when the processing stopped
-     * @param item              cache item
      * @param elem              processing result
      * @return the real time factor
      */
-    private float estimateRTF(long startTimeMillis, long stopTimeMillis, CacheItem item, TTSProcessingResult elem) {
+    private float estimateRTF(long startTimeMillis, long stopTimeMillis, TTSProcessingResult elem) {
         String uuid = elem.getTTSRequest().getCacheItemUuid();
         Log.v(LOG_TAG, "estimateRTF for: " + uuid);
 
@@ -336,7 +334,7 @@ private float estimateRTF(long startTimeMillis, long stopTimeMillis, CacheItem i
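-        // assume currently slowest used sample rate, i.e. 16kHz and 16 bit with 1 channel
-        // TODO: we should use the real sample rate here, but this needs to be passed via the
-        //       TTSProcessingResult
-        final int sampleRate = AudioManager.SAMPLE_RATE_WAV;
+        // use the sample rate reported by the repository for the current voice, 16 bit, 1 channel
+        // TODO: the real sample rate of the processed audio should be used here, but this needs
+        //       to be passed via the TTSProcessingResult
+        final int sampleRate = mRepository.getVoiceNativeSampleRate();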
         final int bytesPerSample = 2;
         final int channels = 1;
 
@@ -445,7 +443,7 @@ private boolean testForAndHandleNetworkVoiceIssues(SynthesisCallback callback,
 
     /**
      * Signal TTS client a TTS error with given error code.
-     *
+     * <p>
      * The sequence for signalling an error seems to be important: callback.start(),
      * callback.error(), callback.done(). Any callback.audioAvailable() call after a callback.error()
      * is ignored.
@@ -455,7 +453,7 @@ private boolean testForAndHandleNetworkVoiceIssues(SynthesisCallback callback,
      */
     private void signalTtsError(SynthesisCallback callback, int errorCode) {
         Log.w(LOG_TAG, "signalTtsError(): errorCode = " + errorCode);
-        callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT,
+        callback.start(mRepository.getVoiceNativeSampleRate(), AudioFormat.ENCODING_PCM_16BIT,
                 AudioManager.N_CHANNELS);
         callback.error(errorCode);
         callback.done();
@@ -467,12 +465,12 @@ private void signalTtsError(SynthesisCallback callback, int errorCode) {
      *
      * @param callback  TTS callback provided in the onSynthesizeText() callback
      */
-    private static void playSilence(SynthesisCallback callback) {
+    private void playSilence(SynthesisCallback callback) {
         Log.v(LOG_TAG, "playSilence() ...");
-        callback.start(AudioManager.SAMPLE_RATE_WAV, AudioFormat.ENCODING_PCM_16BIT,
-                    AudioManager.N_CHANNELS);
+        int sampleRate = mRepository.getVoiceNativeSampleRate();
+        callback.start(sampleRate, AudioFormat.ENCODING_PCM_16BIT, AudioManager.N_CHANNELS);
         setSpeechMarksToBeginning(callback);
-        byte[] silenceData = AudioManager.generatePcmSilence(0.25f);
+        byte[] silenceData = AudioManager.generatePcmSilence(0.25f, sampleRate);
         callback.audioAvailable(silenceData, 0, silenceData.length);
         if (! callback.hasFinished() && callback.hasStarted()) {
             callback.done();
diff --git a/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java b/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java
index 2443a247..e06cd21c 100644
--- a/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java
+++ b/app/src/main/java/com/grammatek/simaromur/audio/AudioManager.java
@@ -16,9 +16,7 @@ public class AudioManager {
     private final static String LOG_TAG = "Simaromur_" + AudioManager.class.getSimpleName();
 
     // Some constants used throughout audio conversion
-    public static final int SAMPLE_RATE_WAV = 16000;
     public static final int SAMPLE_RATE_MP3 = 22050;
-    public static final int SAMPLE_RATE_ONNX = 16000;
     //public static final int SAMPLE_RATE_ONNX = 22050;
     public static final int N_CHANNELS = 1;
 
@@ -79,26 +79,6 @@ static public byte[] applyPitchAndSpeed(final byte[] monoPcmData, int sampleRate
         return outputConversionStream.toByteArray();
     }
 
-    /**
-     * Either apply pitch and speed to ttsData, resulting in a potentially differently sized output
-     * buffer, or simply copy ttsData to the new output buffer, if no changes of speed or pitch
-     * are requested.
-     * Return the newly created output buffer.
-     *
-     * @param monoPcmData byte array of MONO PCM data to be used as input data. 22050 Hz sample rate
-     *                    is expected
-     * @param pitch   pitch to be applied. 1.0f means no pitch change, values > 1.0 mean higher
-     *                pitch, values < 1.0 mean lower pitch than in given pcmData
-     * @param speed   speed to be applied. 1.0f means no speed change, values > 1.0 mean higher
-     *                speed, values < 1.0 mean lower speed than in given pcmData. This parameter
-     *                produces either more data for values >1.0, less data for values < 1.0, or
-     *                no data change for a value of 1.0
-     * @return new byte array with converted PCM data
-     */
-    static public byte[] applyPitchAndSpeed(final byte[] monoPcmData, float pitch, float speed) {
-        return applyPitchAndSpeed(monoPcmData, SAMPLE_RATE_WAV, pitch, speed);
-    }
-
     /**
      * Converts given float values to 16bits PCM. No resampling or interpolation is done.
      * Floats are rounded to the nearest integer.
@@ -284,10 +264,10 @@ static public byte[] pcmFloatTo16BitPCMWithDither(float[] pcmFloats, float norma
         return outBuf;
     }
 
-    static public byte[] generatePcmSilence(float duration) {
+    static public byte[] generatePcmSilence(float duration, int sampleRate) {
         final int nChannels = 1;
         final int nBits = 16;
-        final int nSamples = (int) (duration * SAMPLE_RATE_WAV);
+        final int nSamples = (int) (duration * sampleRate);
         final int nBytes = nSamples * nChannels * nBits / 8;
         return new byte[nBytes];
     }
diff --git a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java
index 8cfadc12..f2a71cbd 100644
--- a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java
+++ b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineController.java
@@ -1,8 +1,11 @@
 package com.grammatek.simaromur.device;
 
 import static com.grammatek.simaromur.cache.AudioFormat.AUDIO_FMT_PCM;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_11KHZ;
 import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_16KHZ;
 import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_22KHZ;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_44_1KHZ;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_48KHZ;
 
 import android.media.AudioFormat;
 import android.util.Log;
@@ -42,7 +45,7 @@ public class TTSEngineController {
     TTSEngine mEngine;
     final ExecutorService mExecutorService;
     Future<?> mTaskFuture;  // the currently enqueued task, might be executed by the executor service
-    final TTSAudioControl mTTSAudioControl16khz;
+    TTSAudioControl mTTSAudioControl;
 
     /**
      * Constructor
@@ -56,8 +59,6 @@ public TTSEngineController(AssetVoiceManager avm, DownloadVoiceManager dvm) {
         mAVM = avm;
         mDVM = dvm;
         mCurrentVoice = null;
-        mTTSAudioControl16khz = new TTSAudioControl(AudioManager.SAMPLE_RATE_ONNX,
-                AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT);
         // we only need one thread per Audio setting
         mExecutorService = Executors.newSingleThreadExecutor();
     }
@@ -84,6 +85,8 @@ public void LoadEngine(Voice voice) throws IOException {
                     Log.v(LOG_TAG, "LoadEngine: " + devVoice.Type);
                     try {
                         mEngine = new TTSEngineOnnx(App.getContext().getAssets(), devVoice);
+                        mTTSAudioControl = new TTSAudioControl(mEngine.GetNativeSampleRate(),
+                                AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT);
                         mCurrentVoice = devVoice;
                     } catch (IllegalArgumentException e) {
                         Log.e(LOG_TAG, "LoadEngine: " + e.getMessage());
@@ -115,9 +118,9 @@ public void UnloadEngine() {
      * Start to speak given text with given voice.
      */
     synchronized
-    public SpeakTask StartSpeak(CacheItem item, float speed, float pitch, int sampleRate,
+    public SpeakTask StartSpeak(CacheItem item, float speed, float pitch,
                            TTSAudioControl.AudioFinishedObserver observer, TTSRequest ttsRequest) {
-        if (mEngine == null || mCurrentVoice == null) {
+        if (mEngine == null || mCurrentVoice == null || mTTSAudioControl == null) {
             String errorMsg = "No TTS engine loaded !";
             Log.e(LOG_TAG, errorMsg);
             throw new RuntimeException(errorMsg);
@@ -128,7 +131,7 @@ public SpeakTask StartSpeak(CacheItem item, float speed, float pitch, int sample
             mTaskFuture.cancel(true);
         }
         Log.v(LOG_TAG, "StartSpeak: scheduling new SpeakTask (1)");
-        SpeakTask speakTask = new SpeakTask(item.getUuid(), speed, pitch, sampleRate, observer, mCurrentVoice, ttsRequest);
+        SpeakTask speakTask = new SpeakTask(item.getUuid(), speed, pitch, observer, mCurrentVoice, ttsRequest);
         mTaskFuture = mExecutorService.submit(speakTask);
         return speakTask;
     }
@@ -159,7 +162,12 @@ public void StartSpeak(TTSObserver observer, TTSRequest ttsRequest) {
      */
     synchronized
     public void StopSpeak(TTSEngineController.SpeakTask speakTask) {
-        mTTSAudioControl16khz.stop();
+        // Stopping must never fail: as long as no engine has been loaded, no audio control
+        // exists and there is nothing to stop. Throwing here would make every stop request
+        // issued before an on-device voice is loaded fail with a RuntimeException.
+        if (mTTSAudioControl != null) {
+            mTTSAudioControl.stop();
+        }
         if (speakTask != null) {
             speakTask.stopSynthesis();
         }
@@ -175,7 +183,6 @@ public class SpeakTask implements Runnable {
         CacheItem item;
         float speed;
         float pitch;
-        int sampleRate;
         TTSObserver observer;
         TTSAudioControl.AudioFinishedObserver audioObserver;
         boolean isStopped = false;
@@ -189,10 +196,9 @@ public class SpeakTask implements Runnable {
          *                      speed
          * @param pitch         pitch multiplier of voice, how many times higher/lower than normal voice
          *                      pitch
-         * @param sampleRate    sample rate to use for the synthesis
          * @param ttsRequest    request to be used for the synthesis
          */
-        public SpeakTask(String itemUuid, float speed, float pitch, int sampleRate,
+        public SpeakTask(String itemUuid, float speed, float pitch,
                          TTSAudioControl.AudioFinishedObserver audioObserver, DeviceVoice voice,
                          TTSRequest ttsRequest) {
             this.ttsRequest = ttsRequest;
@@ -200,7 +206,6 @@ public SpeakTask(String itemUuid, float speed, float pitch, int sampleRate,
             this.item = optItem.orElse(null);
             this.speed = speed;
             this.pitch = pitch;
-            this.sampleRate = sampleRate;
             this.audioObserver = audioObserver;
             this.observer = null;
             this.voice = voice;
@@ -222,7 +227,6 @@ public SpeakTask(TTSObserver observer, TTSRequest ttsRequest, DeviceVoice voice)
             this.observer = observer;
             this.speed = observer.getSpeed();
             this.pitch = observer.getPitch();
-            this.sampleRate = mEngine.GetNativeSampleRate();
             this.voice = voice;
         }
 
@@ -233,7 +237,11 @@ public SpeakTask(TTSObserver observer, TTSRequest ttsRequest, DeviceVoice voice)
          */
         public void run() {
             Log.v(LOG_SPEAK_TASK_TAG, "run() called");
-            assert(sampleRate == mEngine.GetNativeSampleRate());
+            if (mEngine == null || mCurrentVoice == null || mTTSAudioControl == null) {
+                String errorMsg = "run(): No TTS engine loaded !";
+                Log.e(LOG_TAG, errorMsg);
+                throw new RuntimeException(errorMsg);
+            }
 
             if (shouldStop())  {
                 Log.v(LOG_SPEAK_TASK_TAG, "run(): shouldStop(1): true");
@@ -278,10 +286,8 @@ public void run() {
             if (observer == null) {
                 // TODO: also the media players should stop, if item has changed:
                 //       - pass the cache item along
-                byte[] processedAudio = AudioManager.applyPitchAndSpeed(audioData, sampleRate, pitch, speed);
-                if (sampleRate == AudioManager.SAMPLE_RATE_ONNX) {
-                    mTTSAudioControl16khz.play(new TTSAudioControl.AudioEntry(processedAudio, audioObserver));
-                }
+                byte[] processedAudio = AudioManager.applyPitchAndSpeed(audioData, mEngine.GetNativeSampleRate(), pitch, speed);
+                mTTSAudioControl.play(new TTSAudioControl.AudioEntry(processedAudio, audioObserver));
             } else {
                 observer.update(audioData, ttsRequest);
             }
@@ -314,13 +320,27 @@ private byte[] synthesizeSpeech(PhonemeEntry phonemeEntry) {
 
         private boolean saveAudioToCacheEntry(PhonemeEntry phonemeEntry, byte[] bytes) {
             SampleRate sampleRate;
-            if (mEngine.GetNativeSampleRate() == 22050) {
-                sampleRate = SAMPLE_RATE_22KHZ;
-            } else if (mEngine.GetNativeSampleRate() == 16000) {
-                sampleRate = SAMPLE_RATE_16KHZ;
-            } else {
-                throw new IllegalStateException("Unknown sample rate: " + mEngine.GetNativeSampleRate());
+            switch(mEngine.GetNativeSampleRate())
+            {
+                case 11025:
+                    sampleRate = SAMPLE_RATE_11KHZ;
+                    break;
+                case 16000:
+                    sampleRate = SAMPLE_RATE_16KHZ;
+                    break;
+                case 22050:
+                    sampleRate = SAMPLE_RATE_22KHZ;
+                    break;
+                case 44100:
+                    sampleRate = SAMPLE_RATE_44_1KHZ;
+                    break;
+                case 48000:
+                    sampleRate = SAMPLE_RATE_48KHZ;
+                    break;
+                default:
+                    throw new IllegalStateException("Unknown sample rate: " + mEngine.GetNativeSampleRate());
             }
+
             final VoiceAudioDescription vad = UtteranceCacheManager.newAudioDescription(AUDIO_FMT_PCM,
                     sampleRate, bytes.length, mCurrentVoice.InternalName, mCurrentVoice.Version);
             if (bytes.length == 0) {
diff --git a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java
index 5d27c8df..c803c5f9 100644
--- a/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java
+++ b/app/src/main/java/com/grammatek/simaromur/device/TTSEngineOnnx.java
@@ -35,13 +35,13 @@
 
 public class TTSEngineOnnx  implements TTSEngine {
     private final static String LOG_TAG = "Simaromur_" + TTSEngineOnnx.class.getSimpleName();
-    private final static int SAMPLE_RATE = 16000;
     private final static float SENTENCE_PAUSE = 0.5f;
     private static DeviceVoice sVoice = null;
     // matches a position preceded by any of the characters '.!?;' not followed by zero or
     // more whitespace characters ([\\s]*) and then a double quote (\").
-    final static  String SplitPunctuationSymbols = "(?<=[.!?;])(?![\\s]*\")";;
+    final static  String SplitPunctuationSymbols = "(?<=[.!?;])(?![\\s]*\")";
 
+    final byte[] mPauseSilence;
     private OrtEnvironment mOrtEnv;
     private OrtSession mOrtSession;
     private VitsConfig mModelConfig;
@@ -82,6 +82,13 @@ public TTSEngineOnnx(AssetManager asm, DeviceVoice voice) {
         }
         mPhoneConverter = new VitsPhoneConverter(mModelConfig.phonemeIdMap);
 
+        // only accept rates the cache can store, see TTSEngineController.saveAudioToCacheEntry()
+        final int rate = mModelConfig.audio.sampleRate;
+        if (rate != 11025 && rate != 16000 && rate != 22050 && rate != 44100 && rate != 48000) {
+            throw new IllegalArgumentException("Voice " + voice.Name + ": invalid sample rate " + rate + " Hz");
+        }
+        mPauseSilence = AudioManager.generatePcmSilence(SENTENCE_PAUSE, GetNativeSampleRate());
+
         Log.v(LOG_TAG, "Onnx model loaded from assets/" + modelPath);
         sVoice = voice;
     }
@@ -146,7 +153,6 @@ public byte[] SpeakToPCM(String ipas) {
         Instant startTime = Instant.now();
 
         List<byte[]> pcmList = new ArrayList<>();
-        byte[] silence = AudioManager.generatePcmSilence(SENTENCE_PAUSE);
         List<String> sentences = new ArrayList<>();
 
         // split ipa's to sentences by splitting at punctuation; we also need to
@@ -159,7 +165,7 @@ public byte[] SpeakToPCM(String ipas) {
             pcmList.add(pcmSentence);
             generatedPcmLength += pcmSentence.length;
             // add silence after each sentence, as the voice doesn't have any pauses
-            pcmList.add(silence);
+            pcmList.add(mPauseSilence);
         }
         // remove the last silence again
         pcmList.remove(pcmList.size()-1);
@@ -231,7 +237,7 @@ private byte[] speakSentenceToPCM(String ipas) {
 
     @Override
     public int GetNativeSampleRate() {
-        return SAMPLE_RATE;
+        return mModelConfig.audio.sampleRate;
     }
 
     public static class VitsPhoneConverter {
diff --git a/app/src/main/java/com/grammatek/simaromur/network/api/SpeakController.java b/app/src/main/java/com/grammatek/simaromur/network/api/SpeakController.java
index 01ced6e6..30df0d49 100644
--- a/app/src/main/java/com/grammatek/simaromur/network/api/SpeakController.java
+++ b/app/src/main/java/com/grammatek/simaromur/network/api/SpeakController.java
@@ -8,15 +8,14 @@
 import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_16KHZ;
 import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_22KHZ;
 import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_44_1KHZ;
+import static com.grammatek.simaromur.cache.SampleRate.SAMPLE_RATE_48KHZ;
 
 import android.util.Log;
-
 import androidx.annotation.NonNull;
 
 import com.grammatek.simaromur.App;
 import com.grammatek.simaromur.AppRepository;
 import com.grammatek.simaromur.TTSRequest;
-import com.grammatek.simaromur.TTSService;
 import com.grammatek.simaromur.audio.AudioObserver;
 import com.grammatek.simaromur.cache.AudioFormat;
 import com.grammatek.simaromur.cache.CacheItem;
@@ -25,7 +24,6 @@
 import com.grammatek.simaromur.cache.Utterance;
 import com.grammatek.simaromur.cache.UtteranceCacheManager;
 import com.grammatek.simaromur.cache.VoiceAudioDescription;
-import com.grammatek.simaromur.db.VoiceDao;
 import com.grammatek.simaromur.network.api.pojo.SpeakRequest;
 
 import org.jetbrains.annotations.NotNull;
@@ -103,17 +101,45 @@ public byte[] speak(SpeakRequest request) throws IOException {
         byte[] voiceAudio = null;
 
         if (response.isSuccessful()) {
-            ResponseBody body = response.body();
-            assert body != null;
-            voiceAudio = body.bytes();
-            Log.v(LOG_TAG, "API returned data of size: " + voiceAudio.length);
+            try  (ResponseBody body = response.body()) {
+                if (body != null) {
+                    voiceAudio = body.bytes();
+                    Log.v(LOG_TAG, "API returned data of size: " + voiceAudio.length);
+                } else {
+                    Log.e(LOG_TAG, "API Error: no audio data returned");
+                }
+            } catch (IOException e) {
+                Log.e(LOG_TAG, "Exception: " + e.getMessage());
+                e.printStackTrace();
+            }
         }
         else {
-            Log.e(LOG_TAG, "API Error: " + response.errorBody());
+            String errMsg;
+            try (ResponseBody errorBody = response.errorBody()) {
+                if (errorBody != null) {
+                    errMsg = errorBody.string();
+                    Log.e(LOG_TAG, "speak(): API Error: " + errMsg);
+                } else {
+                    Log.e(LOG_TAG, "speak(): API Error: unknown error reason");
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+                errMsg = e.getMessage();
+                Log.e(LOG_TAG, "speak(): Error occurred: " + errMsg);
+            }
         }
         return voiceAudio;
     }
 
+    /**
+     * Returns the native sample rate of the voice model used for the network speak request.
+     *
+     * @return  native sample rate of the voice model
+     */
+    public int getNativeSampleRate() {
+        return 22050;
+    }
+
     /**
      * Builds a Retrofit caller object for the Network API without calling its endpoint yet.
      *
@@ -150,17 +176,20 @@ public synchronized void onResponse(@NotNull Call<ResponseBody> call, Response<R
                 mAudioObserver.error("Cannot deduct TTSRequest from network response", dummyRequest);
                 return;
             }
-            ResponseBody body = response.body();
-            assert (body != null);
-            try {
-                // @note: body.bytes() loads the whole response into memory
-                byte[] audioData = body.bytes();
-                Log.v(LOG_TAG, "API returned: " + audioData.length + " bytes for "
-                        + ttsRequest.serialize());
-                if (saveSpeechDataToCache(audioData, ttsRequest.getCacheItemUuid())) {
-                    mAudioObserver.update(audioData, ttsRequest);
+            try  (ResponseBody body = response.body()) {
+                if (body != null) {
+                    // @note: body.bytes() blocks & loads the whole response into memory
+                    byte[] audioData = body.bytes();
+                    Log.v(LOG_TAG, "API returned: " + audioData.length + " bytes for "
+                            + ttsRequest.serialize());
+                    if (saveSpeechDataToCache(audioData, ttsRequest.getCacheItemUuid())) {
+                        mAudioObserver.update(audioData, ttsRequest);
+                    } else {
+                        mAudioObserver.error("failed to save speech data", ttsRequest);
+                    }
                 } else {
-                    mAudioObserver.error("failed to save speech data", ttsRequest);
+                    Log.e(LOG_TAG, "API Error: no audio data returned");
+                    mAudioObserver.error("no audio data returned", ttsRequest);
                 }
             } catch (IOException e) {
                 Log.e(LOG_TAG, "Exception: " + e.getMessage());
@@ -169,10 +198,12 @@ public synchronized void onResponse(@NotNull Call<ResponseBody> call, Response<R
             }
         } else {
             String errMsg = null;
-            try {
-                if (response.errorBody() != null) {
-                    errMsg = response.errorBody().string();
+            try (ResponseBody errorBody = response.errorBody()) {
+                if (errorBody != null) {
+                    errMsg = errorBody.string();
                     Log.e(LOG_TAG, "API Error: " + errMsg);
+                } else {
+                    Log.e(LOG_TAG, "API Error: unknown error reason");
                 }
             } catch (IOException e) {
                 e.printStackTrace();
@@ -181,7 +212,7 @@ public synchronized void onResponse(@NotNull Call<ResponseBody> call, Response<R
             // we couldn't retrieve the corresponding uuid from the response header, therefore we
             // need to use a dummy cache item uuid
             TTSRequest dummyRequest = new TTSRequest(AudioObserver.DUMMY_CACHEITEM_UUID);
-            Log.e(LOG_TAG, "onResponse: error occured: " + errMsg);
+            Log.e(LOG_TAG, "onResponse: error occurred: " + errMsg);
             mAudioObserver.error(errMsg, dummyRequest);
         }
         mCall = null;
@@ -229,7 +260,6 @@ private boolean saveSpeechDataToCache(byte[] data, String uuid) {
                             Log.e(LOG_TAG, "Couldn't add audio to cache item " + item.getUuid());
                         }
                     }
-
                 } else {
                     Log.e(LOG_TAG, "onResponse(): No phonemes found in cache item "
                             + uuid + " ?!");
@@ -264,6 +294,9 @@ private AudioFormat getAudioFormat() {
     private SampleRate getSampleRate() {
         SampleRate sampleRate = INVALID_SAMPLE_RATE;
         switch (mRequest.SampleRate) {
+            case "48000":
+                sampleRate = SAMPLE_RATE_48KHZ;
+                break;
             case "44100":
                 sampleRate = SAMPLE_RATE_44_1KHZ;
                 break;
diff --git a/app/src/main/proto/utterance_cache.proto b/app/src/main/proto/utterance_cache.proto
index 75d9996b..704b7b3a 100644
--- a/app/src/main/proto/utterance_cache.proto
+++ b/app/src/main/proto/utterance_cache.proto
@@ -27,6 +27,7 @@ enum SampleRate {
     SAMPLE_RATE_16KHZ = 2;
     SAMPLE_RATE_22KHZ = 3;
     SAMPLE_RATE_44_1KHZ = 4;
+    SAMPLE_RATE_48KHZ = 5;
 }
 
 // Describes the voice audio for a specific phoneme