diff --git a/FileGallery.jpg b/FileGallery.jpg
index d73a05243..8096f4405 100644
Binary files a/FileGallery.jpg and b/FileGallery.jpg differ
diff --git a/README.md b/README.md
index ebf0a75f7..51a47f1dd 100644
--- a/README.md
+++ b/README.md
@@ -8,9 +8,9 @@
Инструкция по сборке:
Требуется:
- 1) Android Studio Flamingo (2022.2.1) или выше. Kotlin 1.8.*
+ 1) Android Studio Giraffe (2022.3.1) или выше. Kotlin 1.9.*
2) Android SDK 33
- 3) Android NDK 25.1.8937393
+ 3) Android NDK 25.2.9519653
Если не работает музыка в Fenrir Kate, обновите kate_receipt_gms_token в app.build_config.
Взять токен можно из Kate Mobile Extra Mod
diff --git a/app_fenrir/build.gradle b/app_fenrir/build.gradle
index 67f4bff3d..1aa88212e 100644
--- a/app_fenrir/build.gradle
+++ b/app_fenrir/build.gradle
@@ -52,7 +52,7 @@ android {
checkReleaseBuilds = true
}
- tasks.withType(JavaCompile) {
+ tasks.withType(JavaCompile).configureEach {
options.compilerArgs << "-Xmaxwarns" << "1000" << "-Xmaxerrs" << "1000"
}
@@ -76,7 +76,6 @@ android {
release {
minifyEnabled = false
shrinkResources = false
- zipAlignEnabled = true
}
debug {
minifyEnabled = false
@@ -84,7 +83,7 @@ android {
}
}
- flavorDimensions "type"
+ flavorDimensions.add("type")
productFlavors {
fenrir {
applicationId = "dev.ragnarok.fenrir"
diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/accounts/AccountsPresenter.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/accounts/AccountsPresenter.kt
index 61ac5a044..d030ddcc9 100644
--- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/accounts/AccountsPresenter.kt
+++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/accounts/AccountsPresenter.kt
@@ -36,6 +36,7 @@ import dev.ragnarok.fenrir.kJson
import dev.ragnarok.fenrir.longpoll.LongpollInstance
import dev.ragnarok.fenrir.model.Account
import dev.ragnarok.fenrir.model.IOwnersBundle
+import dev.ragnarok.fenrir.model.MessageStatus
import dev.ragnarok.fenrir.model.SaveAccount
import dev.ragnarok.fenrir.model.User
import dev.ragnarok.fenrir.model.criteria.DialogsCriteria
@@ -507,13 +508,19 @@ class AccountsPresenter(savedInstanceState: Bundle?) :
val dialogsJsonElem =
i.jsonObject["conversation"]?.jsonArray ?: continue
if (!dialogsJsonElem.isEmpty()) {
- Includes.stores.dialogs().insertDialogs(
- aid, kJson.decodeFromJsonElement(
- ListSerializer(
- DialogDboEntity.serializer()
- ), dialogsJsonElem
- ), true
- ).blockingAwait()
+ val btmp = kJson.decodeFromJsonElement(
+ ListSerializer(
+ DialogDboEntity.serializer()
+ ), dialogsJsonElem
+ )
+ if (btmp.nonNullNoEmpty()) {
+ for (o in btmp) {
+ o.message?.setStatus(MessageStatus.SENT)
+ }
+ Includes.stores.dialogs().insertDialogs(
+ aid, btmp, true
+ ).blockingAwait()
+ }
}
}
}
diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/base/RecyclerBindableAdapter.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/base/RecyclerBindableAdapter.kt
index 6c35aec93..da02b5a12 100644
--- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/base/RecyclerBindableAdapter.kt
+++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/base/RecyclerBindableAdapter.kt
@@ -204,8 +204,10 @@ abstract class RecyclerBindableAdapter(private
}
//empty out our FrameLayout and replace with our header/footer
- (vh.itemView as ViewGroup).removeAllViews()
- vh.itemView.addView(view)
+ (vh.itemView as ViewGroup).let {
+ it.removeAllViews()
+ it.addView(view)
+ }
}
private fun isHeader(position: Int): Boolean {
diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/messages/chat/ChatFragment.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/messages/chat/ChatFragment.kt
index ffec22acc..1ba67e6df 100644
--- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/messages/chat/ChatFragment.kt
+++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/messages/chat/ChatFragment.kt
@@ -6,6 +6,7 @@ import android.annotation.SuppressLint
import android.app.Activity.RESULT_OK
import android.app.Dialog
import android.content.*
+import android.graphics.Bitmap
import android.net.*
import android.os.Build
import android.os.Bundle
@@ -1570,7 +1571,12 @@ class ChatFragment : PlaceSupportMvpFragment(), IChatV
it,
Uri.fromFile(File(requireActivity().externalCacheDir.toString() + File.separator + "scale.jpg"))
)
- .withAspectRatio(1f, 1f)
+ .withOptions(
+ UCrop.Options().withAspectRatio(1f, 1f)
+ .setCompressionQuality(100)
+ .setCompressionFormat(Bitmap.CompressFormat.JPEG)
+ .setHideBottomControls(false)
+ )
.getIntent(requireActivity())
}
)
diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/wall/userwall/UserWallFragment.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/wall/userwall/UserWallFragment.kt
index a04509a42..8ac15af91 100644
--- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/wall/userwall/UserWallFragment.kt
+++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/wall/userwall/UserWallFragment.kt
@@ -92,7 +92,11 @@ class UserWallFragment : AbsWallFragment(), IU
to_up,
Uri.fromFile(File(requireActivity().externalCacheDir.toString() + File.separator + "scale.jpg"))
)
- .withAspectRatio(1f, 1f)
+ .withOptions(
+ UCrop.Options().withAspectRatio(1f, 1f).setCompressionQuality(100)
+ .setCompressionFormat(Bitmap.CompressFormat.JPEG)
+ .setHideBottomControls(false)
+ )
.getIntent(requireActivity())
)
}
diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/longpoll/NotificationHelper.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/longpoll/NotificationHelper.kt
index 51a0ca6c8..351c8b1d2 100644
--- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/longpoll/NotificationHelper.kt
+++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/longpoll/NotificationHelper.kt
@@ -650,7 +650,7 @@ object NotificationHelper {
return if (urit != null) Content(MimeType, urit) else null
}
- @SuppressLint("RestrictedApi")
+ @SuppressLint("RestrictedApi", "ReportShortcutUsage")
private fun createNotificationShortcut(
context: Context,
builder: NotificationCompat.Builder,
diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackController.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackController.kt
index f7c1f06a7..64e492f10 100644
--- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackController.kt
+++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackController.kt
@@ -10,6 +10,7 @@ import android.content.IntentFilter
import android.content.ServiceConnection
import android.os.IBinder
import android.os.RemoteException
+import androidx.core.content.ContextCompat
import dev.ragnarok.fenrir.R
import dev.ragnarok.fenrir.model.Audio
import dev.ragnarok.fenrir.settings.Settings
@@ -55,7 +56,12 @@ object MusicPlaybackController {
filter.addAction(MusicPlaybackService.META_CHANGED)
filter.addAction(MusicPlaybackService.PREPARED)
filter.addAction(MusicPlaybackService.QUEUE_CHANGED)
- appContext.registerReceiver(receiver, filter)
+ ContextCompat.registerReceiver(
+ appContext,
+ receiver,
+ filter,
+ ContextCompat.RECEIVER_NOT_EXPORTED
+ )
}
fun bindToServiceWithoutStart(
diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackService.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackService.kt
index d6c538eeb..cd6ab7cde 100644
--- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackService.kt
+++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackService.kt
@@ -18,6 +18,7 @@ import android.support.v4.media.session.MediaControllerCompat
import android.support.v4.media.session.MediaSessionCompat
import android.support.v4.media.session.PlaybackStateCompat
import android.util.Log
+import androidx.core.content.ContextCompat
import androidx.media.session.MediaButtonReceiver
import androidx.media3.common.AudioAttributes
import androidx.media3.common.C
@@ -157,7 +158,12 @@ class MusicPlaybackService : Service() {
filter.addAction(PREVIOUS_ACTION)
filter.addAction(REPEAT_ACTION)
filter.addAction(SHUFFLE_ACTION)
- registerReceiver(mIntentReceiver, filter)
+ ContextCompat.registerReceiver(
+ this,
+ mIntentReceiver,
+ filter,
+ ContextCompat.RECEIVER_NOT_EXPORTED
+ )
// Initialize the delayed shutdown intent
val shutdownIntent = Intent(this, MusicPlaybackService::class.java)
diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/ShortcutUtils.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/ShortcutUtils.kt
index 38633a3b7..60aeb74e1 100644
--- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/ShortcutUtils.kt
+++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/ShortcutUtils.kt
@@ -1,5 +1,6 @@
package dev.ragnarok.fenrir.util
+import android.annotation.SuppressLint
import android.annotation.TargetApi
import android.content.Context
import android.content.Intent
@@ -216,6 +217,7 @@ object ShortcutUtils {
}
+ @SuppressLint("ReportShortcutUsage")
@TargetApi(Build.VERSION_CODES.N_MR1)
fun addDynamicShortcut(context: Context, accountId: Long, peer: Peer): Completable {
val app = context.applicationContext
diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/Utils.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/Utils.kt
index 0fb1b232e..2a80e6a0e 100644
--- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/Utils.kt
+++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/Utils.kt
@@ -1002,7 +1002,7 @@ object Utils {
return
}
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) {
- drawable.colorFilter = BlendModeColorFilter(color, BlendMode.MODULATE)
+ drawable.colorFilter = PorterDuffColorFilter(color, PorterDuff.Mode.MULTIPLY)
} else {
drawable.setColorFilter(color, PorterDuff.Mode.MULTIPLY)
}
@@ -1013,7 +1013,7 @@ object Utils {
return
}
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) {
- view.colorFilter = BlendModeColorFilter(color, BlendMode.MODULATE)
+ view.colorFilter = PorterDuffColorFilter(color, PorterDuff.Mode.MULTIPLY)
} else {
view.setColorFilter(color, PorterDuff.Mode.MULTIPLY)
}
@@ -1024,7 +1024,7 @@ object Utils {
return
}
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) {
- view.colorFilter = BlendModeColorFilter(color, BlendMode.MODULATE)
+ view.colorFilter = PorterDuffColorFilter(color, PorterDuff.Mode.MULTIPLY)
} else {
view.setColorFilter(color, PorterDuff.Mode.MULTIPLY)
}
diff --git a/app_filegallery/build.gradle b/app_filegallery/build.gradle
index d0b5794b0..b64af2cf7 100644
--- a/app_filegallery/build.gradle
+++ b/app_filegallery/build.gradle
@@ -51,7 +51,7 @@ android {
checkReleaseBuilds = true
}
- tasks.withType(JavaCompile) {
+ tasks.withType(JavaCompile).configureEach {
options.compilerArgs << "-Xmaxwarns" << "1000" << "-Xmaxerrs" << "1000"
}
@@ -75,7 +75,6 @@ android {
release {
minifyEnabled = false
shrinkResources = false
- zipAlignEnabled = true
}
debug {
minifyEnabled = false
diff --git a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/fragment/PreferencesFragment.kt b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/fragment/PreferencesFragment.kt
index 60da9a43b..ff16202b1 100644
--- a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/fragment/PreferencesFragment.kt
+++ b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/fragment/PreferencesFragment.kt
@@ -802,7 +802,7 @@ class PreferencesFragment : AbsPreferencesFragment(), PreferencesAdapter.OnScree
".SLOGAN"
).absolutePath || sel.absolutePath == File(
Environment.getExternalStorageDirectory(),
- ".OplusOS"
+ "OplusOS"
).absolutePath || sel.absolutePath == File(
Environment.getExternalStorageDirectory(),
".time"
diff --git a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackController.kt b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackController.kt
index 20db5aa65..b99febcf6 100644
--- a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackController.kt
+++ b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackController.kt
@@ -10,6 +10,7 @@ import android.content.IntentFilter
import android.content.ServiceConnection
import android.os.IBinder
import android.os.RemoteException
+import androidx.core.content.ContextCompat
import dev.ragnarok.filegallery.R
import dev.ragnarok.filegallery.model.Audio
import dev.ragnarok.filegallery.settings.Settings
@@ -56,7 +57,12 @@ object MusicPlaybackController {
filter.addAction(MusicPlaybackService.META_CHANGED)
filter.addAction(MusicPlaybackService.PREPARED)
filter.addAction(MusicPlaybackService.QUEUE_CHANGED)
- appContext.registerReceiver(receiver, filter)
+ ContextCompat.registerReceiver(
+ appContext,
+ receiver,
+ filter,
+ ContextCompat.RECEIVER_NOT_EXPORTED
+ )
}
fun bindToServiceWithoutStart(
diff --git a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackService.kt b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackService.kt
index 613383797..df560de11 100644
--- a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackService.kt
+++ b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackService.kt
@@ -18,6 +18,7 @@ import android.support.v4.media.session.MediaControllerCompat
import android.support.v4.media.session.MediaSessionCompat
import android.support.v4.media.session.PlaybackStateCompat
import android.util.Log
+import androidx.core.content.ContextCompat
import androidx.media.session.MediaButtonReceiver
import androidx.media3.common.AudioAttributes
import androidx.media3.common.C
@@ -150,7 +151,12 @@ class MusicPlaybackService : Service() {
filter.addAction(PREVIOUS_ACTION)
filter.addAction(REPEAT_ACTION)
filter.addAction(SHUFFLE_ACTION)
- registerReceiver(mIntentReceiver, filter)
+ ContextCompat.registerReceiver(
+ this,
+ mIntentReceiver,
+ filter,
+ ContextCompat.RECEIVER_NOT_EXPORTED
+ )
// Initialize the delayed shutdown intent
val shutdownIntent = Intent(this, MusicPlaybackService::class.java)
diff --git a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/Utils.kt b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/Utils.kt
index dbaa92650..66f87e98c 100644
--- a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/Utils.kt
+++ b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/Utils.kt
@@ -191,7 +191,7 @@ object Utils {
fun setColorFilter(view: ImageView?, @ColorInt color: Int) {
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) {
- view?.colorFilter = BlendModeColorFilter(color, BlendMode.MODULATE)
+ view?.colorFilter = PorterDuffColorFilter(color, PorterDuff.Mode.MULTIPLY)
} else {
view?.setColorFilter(color, PorterDuff.Mode.MULTIPLY)
}
@@ -202,7 +202,7 @@ object Utils {
return
}
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) {
- view.colorFilter = BlendModeColorFilter(color, BlendMode.MODULATE)
+ view.colorFilter = PorterDuffColorFilter(color, PorterDuff.Mode.MULTIPLY)
} else {
view.setColorFilter(color, PorterDuff.Mode.MULTIPLY)
}
diff --git a/build.gradle b/build.gradle
index 6754a633e..3d256c3e1 100644
--- a/build.gradle
+++ b/build.gradle
@@ -41,7 +41,7 @@ buildscript {
ext.tracingVersion = "1.2.0-rc01"
ext.transitionVersion = "1.4.1"
ext.vectordrawableVersion = "1.2.0-beta01"
- ext.webkitVersion = "1.7.0-rc01"
+ ext.webkitVersion = "1.7.0"
ext.workVersion = "2.8.1"
//firebase libraries
@@ -57,7 +57,7 @@ buildscript {
ext.autoValueVersion = "1.10.1"
//common libraries
- ext.kotlin_version = "1.8.21"
+ ext.kotlin_version = "1.9.0-Beta"
ext.kotlin_coroutines = "1.7.1"
ext.kotlin_serializer = "1.5.1"
ext.okhttpLibraryVersion = "5.0.0-alpha.11"
@@ -66,7 +66,7 @@ buildscript {
ext.guavaVersion = "32.0.0-android"
ext.errorproneVersion = "2.15.0"
ext.checkerCompatQualVersion = "2.5.5"
- ext.checkerQualAndroidVersion = "3.34.0"
+ ext.checkerQualAndroidVersion = "3.35.0"
ext.desugarLibraryVersion = "2.0.3"
//APP_PROPS
@@ -91,7 +91,7 @@ buildscript {
mavenCentral()
}
dependencies {
- classpath "com.android.tools.build:gradle:8.0.2"
+ classpath "com.android.tools.build:gradle:8.1.0-beta04"
classpath "com.google.gms:google-services:4.3.15"
classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version"
classpath "org.jetbrains.kotlin:kotlin-serialization:$kotlin_version"
@@ -106,6 +106,6 @@ allprojects {
}
}
-task clean(type: Delete) {
+tasks.register('clean', Delete) {
delete rootProject.buildDir
}
diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar
index e708b1c02..249e5832f 100644
Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/gradlew b/gradlew
index 4f906e0c8..a69d9cb6c 100644
--- a/gradlew
+++ b/gradlew
@@ -1,7 +1,7 @@
-#!/usr/bin/env sh
+#!/bin/sh
#
-# Copyright 2015 the original author or authors.
+# Copyright © 2015-2021 the original authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,67 +17,101 @@
#
##############################################################################
-##
-## Gradle start up script for UN*X
-##
+#
+# Gradle start up script for POSIX generated by Gradle.
+#
+# Important for running:
+#
+# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
+# noncompliant, but you have some other compliant shell such as ksh or
+# bash, then to run this script, type that shell name before the whole
+# command line, like:
+#
+# ksh Gradle
+#
+# Busybox and similar reduced shells will NOT work, because this script
+# requires all of these POSIX shell features:
+# * functions;
+# * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
+# «${var#prefix}», «${var%suffix}», and «$( cmd )»;
+# * compound commands having a testable exit status, especially «case»;
+# * various built-in commands including «command», «set», and «ulimit».
+#
+# Important for patching:
+#
+# (2) This script targets any POSIX shell, so it avoids extensions provided
+# by Bash, Ksh, etc; in particular arrays are avoided.
+#
+# The "traditional" practice of packing multiple parameters into a
+# space-separated string is a well documented source of bugs and security
+# problems, so this is (mostly) avoided, by progressively accumulating
+# options in "$@", and eventually passing that to Java.
+#
+# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
+# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
+# see the in-line comments for details.
+#
+# There are tweaks for specific operating systems such as AIX, CygWin,
+# Darwin, MinGW, and NonStop.
+#
+# (3) This script is generated from the Groovy template
+# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
+# within the Gradle project.
+#
+# You can find Gradle at https://github.com/gradle/gradle/.
+#
##############################################################################
# Attempt to set APP_HOME
+
# Resolve links: $0 may be a link
-PRG="$0"
-# Need this for relative symlinks.
-while [ -h "$PRG" ] ; do
- ls=`ls -ld "$PRG"`
- link=`expr "$ls" : '.*-> \(.*\)$'`
- if expr "$link" : '/.*' > /dev/null; then
- PRG="$link"
- else
- PRG=`dirname "$PRG"`"/$link"
- fi
+app_path=$0
+
+# Need this for daisy-chained symlinks.
+while
+ APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
+ [ -h "$app_path" ]
+do
+ ls=$( ls -ld "$app_path" )
+ link=${ls#*' -> '}
+ case $link in #(
+ /*) app_path=$link ;; #(
+ *) app_path=$APP_HOME$link ;;
+ esac
done
-SAVED="`pwd`"
-cd "`dirname \"$PRG\"`/" >/dev/null
-APP_HOME="`pwd -P`"
-cd "$SAVED" >/dev/null
+
+APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
APP_NAME="Gradle"
-APP_BASE_NAME=`basename "$0"`
+APP_BASE_NAME=${0##*/}
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
-MAX_FD="maximum"
+MAX_FD=maximum
warn () {
echo "$*"
-}
+} >&2
die () {
echo
echo "$*"
echo
exit 1
-}
+} >&2
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
-case "`uname`" in
- CYGWIN* )
- cygwin=true
- ;;
- Darwin* )
- darwin=true
- ;;
- MINGW* )
- msys=true
- ;;
- NONSTOP* )
- nonstop=true
- ;;
+case "$( uname )" in #(
+ CYGWIN* ) cygwin=true ;; #(
+ Darwin* ) darwin=true ;; #(
+ MSYS* | MINGW* ) msys=true ;; #(
+ NONSTOP* ) nonstop=true ;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
@@ -87,9 +121,9 @@ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
- JAVACMD="$JAVA_HOME/jre/sh/java"
+ JAVACMD=$JAVA_HOME/jre/sh/java
else
- JAVACMD="$JAVA_HOME/bin/java"
+ JAVACMD=$JAVA_HOME/bin/java
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
@@ -98,7 +132,7 @@ Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
- JAVACMD="java"
+ JAVACMD=java
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
@@ -106,80 +140,101 @@ location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
-if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
- MAX_FD_LIMIT=`ulimit -H -n`
- if [ $? -eq 0 ] ; then
- if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
- MAX_FD="$MAX_FD_LIMIT"
- fi
- ulimit -n $MAX_FD
- if [ $? -ne 0 ] ; then
- warn "Could not set maximum file descriptor limit: $MAX_FD"
- fi
- else
- warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
- fi
+if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
+ case $MAX_FD in #(
+ max*)
+ MAX_FD=$( ulimit -H -n ) ||
+ warn "Could not query maximum file descriptor limit"
+ esac
+ case $MAX_FD in #(
+ '' | soft) :;; #(
+ *)
+ ulimit -n "$MAX_FD" ||
+ warn "Could not set maximum file descriptor limit to $MAX_FD"
+ esac
fi
-# For Darwin, add options to specify how the application appears in the dock
-if $darwin; then
- GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
-fi
+# Collect all arguments for the java command, stacking in reverse order:
+# * args from the command line
+# * the main class name
+# * -classpath
+# * -D...appname settings
+# * --module-path (only if needed)
+# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
# For Cygwin or MSYS, switch paths to Windows format before running java
-if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
- APP_HOME=`cygpath --path --mixed "$APP_HOME"`
- CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
-
- JAVACMD=`cygpath --unix "$JAVACMD"`
-
- # We build the pattern for arguments to be converted via cygpath
- ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
- SEP=""
- for dir in $ROOTDIRSRAW ; do
- ROOTDIRS="$ROOTDIRS$SEP$dir"
- SEP="|"
- done
- OURCYGPATTERN="(^($ROOTDIRS))"
- # Add a user-defined pattern to the cygpath arguments
- if [ "$GRADLE_CYGPATTERN" != "" ] ; then
- OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
- fi
+if "$cygwin" || "$msys" ; then
+ APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
+ CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
+
+ JAVACMD=$( cygpath --unix "$JAVACMD" )
+
# Now convert the arguments - kludge to limit ourselves to /bin/sh
- i=0
- for arg in "$@" ; do
- CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
- CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
-
- if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
- eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
- else
- eval `echo args$i`="\"$arg\""
+ for arg do
+ if
+ case $arg in #(
+ -*) false ;; # don't mess with options #(
+ /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
+ [ -e "$t" ] ;; #(
+ *) false ;;
+ esac
+ then
+ arg=$( cygpath --path --ignore --mixed "$arg" )
fi
- i=`expr $i + 1`
+ # Roll the args list around exactly as many times as the number of
+ # args, so each arg winds up back in the position where it started, but
+ # possibly modified.
+ #
+ # NB: a `for` loop captures its iteration list before it begins, so
+ # changing the positional parameters here affects neither the number of
+ # iterations, nor the values presented in `arg`.
+ shift # remove old arg
+ set -- "$@" "$arg" # push replacement arg
done
- case $i in
- 0) set -- ;;
- 1) set -- "$args0" ;;
- 2) set -- "$args0" "$args1" ;;
- 3) set -- "$args0" "$args1" "$args2" ;;
- 4) set -- "$args0" "$args1" "$args2" "$args3" ;;
- 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
- 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
- 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
- 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
- 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
- esac
fi
-# Escape application args
-save () {
- for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
- echo " "
-}
-APP_ARGS=`save "$@"`
+# Collect all arguments for the java command;
+# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
+# shell script including quotes and variable substitutions, so put them in
+# double quotes to make sure that they get re-expanded; and
+# * put everything else in single quotes, so that it's not re-expanded.
+
+set -- \
+ "-Dorg.gradle.appname=$APP_BASE_NAME" \
+ -classpath "$CLASSPATH" \
+ org.gradle.wrapper.GradleWrapperMain \
+ "$@"
+
+# Stop when "xargs" is not available.
+if ! command -v xargs >/dev/null 2>&1
+then
+ die "xargs is not available"
+fi
+
+# Use "xargs" to parse quoted args.
+#
+# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
+#
+# In Bash we could simply go:
+#
+# readarray ARGS < <( xargs -n1 <<<"$var" ) &&
+# set -- "${ARGS[@]}" "$@"
+#
+# but POSIX shell has neither arrays nor command substitution, so instead we
+# post-process each arg (as a line of input to sed) to backslash-escape any
+# character that might be a shell metacharacter, then use eval to reverse
+# that process (while maintaining the separation between arguments), and wrap
+# the whole thing up as a single "set" statement.
+#
+# This will of course break if any of these variables contains a newline or
+# an unmatched quote.
+#
-# Collect all arguments for the java command, following the shell quoting and substitution rules
-eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+eval "set -- $(
+ printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
+ xargs -n1 |
+ sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
+ tr '\n' ' '
+ )" '"$@"'
exec "$JAVACMD" "$@"
diff --git a/gradlew.bat b/gradlew.bat
index 107acd32c..f127cfd49 100644
--- a/gradlew.bat
+++ b/gradlew.bat
@@ -14,7 +14,7 @@
@rem limitations under the License.
@rem
-@if "%DEBUG%" == "" @echo off
+@if "%DEBUG%"=="" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@@ -25,7 +25,7 @@
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
-if "%DIRNAME%" == "" set DIRNAME=.
+if "%DIRNAME%"=="" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@@ -40,7 +40,7 @@ if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
-if "%ERRORLEVEL%" == "0" goto execute
+if %ERRORLEVEL% equ 0 goto execute
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
@@ -75,13 +75,15 @@ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
:end
@rem End local scope for the variables with windows NT shell
-if "%ERRORLEVEL%"=="0" goto mainEnd
+if %ERRORLEVEL% equ 0 goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
-if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
-exit /b 1
+set EXIT_CODE=%ERRORLEVEL%
+if %EXIT_CODE% equ 0 set EXIT_CODE=1
+if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
+exit /b %EXIT_CODE%
:mainEnd
if "%OS%"=="Windows_NT" endlocal
diff --git a/image/src/main/AndroidManifest.xml b/image/src/main/AndroidManifest.xml
index 28cbe9a85..c3022eb47 100644
--- a/image/src/main/AndroidManifest.xml
+++ b/image/src/main/AndroidManifest.xml
@@ -1,7 +1,7 @@
-
+
= Build.VERSION_CODES.TIRAMISU) {
+ return intent.getParcelableExtra(EXTRA_OUTPUT_URI, Uri.class);
+ } else {
+ return intent.getParcelableExtra(EXTRA_OUTPUT_URI);
+ }
}
/**
@@ -110,9 +117,14 @@ public static float getOutputCropAspectRatio(@NonNull Intent intent) {
* @param result crop result Intent
* @return Throwable that could happen while image processing
*/
+ @SuppressWarnings("deprecation")
@Nullable
public static Throwable getError(@NonNull Intent result) {
- return (Throwable) result.getSerializableExtra(EXTRA_ERROR);
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.TIRAMISU) {
+ return result.getSerializableExtra(EXTRA_ERROR, Throwable.class);
+ } else {
+ return (Throwable) result.getSerializableExtra(EXTRA_ERROR);
+ }
}
/**
@@ -174,20 +186,6 @@ public Intent getIntent(@NonNull Context context) {
return mCropIntent;
}
- /**
- * Get Fragment {@link UCropFragment}
- *
- * @return Fragment of {@link UCropFragment}
- */
- public UCropFragment getFragment() {
- return UCropFragment.newInstance(mCropOptionsBundle);
- }
-
- public UCropFragment getFragment(Bundle bundle) {
- mCropOptionsBundle = bundle;
- return getFragment();
- }
-
/**
* Class that helps to setup advanced configs that are not commonly used.
* Use it with method {@link #withOptions(Options)}
@@ -250,24 +248,27 @@ public Bundle getOptionBundle() {
/**
* Set one of {@link android.graphics.Bitmap.CompressFormat} that will be used to save resulting Bitmap.
*/
- public void setCompressionFormat(@NonNull Bitmap.CompressFormat format) {
+ public Options setCompressionFormat(@NonNull Bitmap.CompressFormat format) {
mOptionBundle.putString(EXTRA_COMPRESSION_FORMAT_NAME, format.name());
+ return this;
}
/**
* Set compression quality [0-100] that will be used to save resulting Bitmap.
*/
- public void setCompressionQuality(@IntRange(from = 0) int compressQuality) {
+ public Options setCompressionQuality(@IntRange(from = 0) int compressQuality) {
mOptionBundle.putInt(EXTRA_COMPRESSION_QUALITY, compressQuality);
+ return this;
}
/**
* Choose what set of gestures will be enabled on each tab - if any.
*/
- public void setAllowedGestures(@UCropActivity.GestureTypes int tabScale,
- @UCropActivity.GestureTypes int tabRotate,
- @UCropActivity.GestureTypes int tabAspectRatio) {
+ public Options setAllowedGestures(@UCropActivity.GestureTypes int tabScale,
+ @UCropActivity.GestureTypes int tabRotate,
+ @UCropActivity.GestureTypes int tabAspectRatio) {
mOptionBundle.putIntArray(EXTRA_ALLOWED_GESTURES, new int[]{tabScale, tabRotate, tabAspectRatio});
+ return this;
}
/**
@@ -275,8 +276,9 @@ public void setAllowedGestures(@UCropActivity.GestureTypes int tabScale,
*
* @param maxScaleMultiplier - (minScale * maxScaleMultiplier) = maxScale
*/
- public void setMaxScaleMultiplier(@FloatRange(from = 1.0, fromInclusive = false) float maxScaleMultiplier) {
+ public Options setMaxScaleMultiplier(@FloatRange(from = 1.0, fromInclusive = false) float maxScaleMultiplier) {
mOptionBundle.putFloat(EXTRA_MAX_SCALE_MULTIPLIER, maxScaleMultiplier);
+ return this;
}
/**
@@ -284,8 +286,9 @@ public void setMaxScaleMultiplier(@FloatRange(from = 1.0, fromInclusive = false)
*
* @param durationMillis - duration in milliseconds
*/
- public void setImageToCropBoundsAnimDuration(@IntRange(from = MIN_SIZE) int durationMillis) {
+ public Options setImageToCropBoundsAnimDuration(@IntRange(from = MIN_SIZE) int durationMillis) {
mOptionBundle.putInt(EXTRA_IMAGE_TO_CROP_BOUNDS_ANIM_DURATION, durationMillis);
+ return this;
}
/**
@@ -293,148 +296,169 @@ public void setImageToCropBoundsAnimDuration(@IntRange(from = MIN_SIZE) int dura
*
* @param maxBitmapSize - size in pixels
*/
- public void setMaxBitmapSize(@IntRange(from = MIN_SIZE) int maxBitmapSize) {
+ public Options setMaxBitmapSize(@IntRange(from = MIN_SIZE) int maxBitmapSize) {
mOptionBundle.putInt(EXTRA_MAX_BITMAP_SIZE, maxBitmapSize);
+ return this;
}
/**
* @param color - desired color of dimmed area around the crop bounds
*/
- public void setDimmedLayerColor(@ColorInt int color) {
+ public Options setDimmedLayerColor(@ColorInt int color) {
mOptionBundle.putInt(EXTRA_DIMMED_LAYER_COLOR, color);
+ return this;
}
/**
* @param isCircle - set it to true if you want dimmed layer to have an circle inside
*/
- public void setCircleDimmedLayer(boolean isCircle) {
+ public Options setCircleDimmedLayer(boolean isCircle) {
mOptionBundle.putBoolean(EXTRA_CIRCLE_DIMMED_LAYER, isCircle);
+ return this;
}
/**
* @param show - set to true if you want to see a crop frame rectangle on top of an image
*/
- public void setShowCropFrame(boolean show) {
+ public Options setShowCropFrame(boolean show) {
mOptionBundle.putBoolean(EXTRA_SHOW_CROP_FRAME, show);
+ return this;
}
/**
* @param color - desired color of crop frame
*/
- public void setCropFrameColor(@ColorInt int color) {
+ public Options setCropFrameColor(@ColorInt int color) {
mOptionBundle.putInt(EXTRA_CROP_FRAME_COLOR, color);
+ return this;
}
/**
* @param width - desired width of crop frame line in pixels
*/
- public void setCropFrameStrokeWidth(@IntRange(from = 0) int width) {
+ public Options setCropFrameStrokeWidth(@IntRange(from = 0) int width) {
mOptionBundle.putInt(EXTRA_CROP_FRAME_STROKE_WIDTH, width);
+ return this;
}
/**
* @param show - set to true if you want to see a crop grid/guidelines on top of an image
*/
- public void setShowCropGrid(boolean show) {
+ public Options setShowCropGrid(boolean show) {
mOptionBundle.putBoolean(EXTRA_SHOW_CROP_GRID, show);
+ return this;
}
/**
* @param count - crop grid rows count.
*/
- public void setCropGridRowCount(@IntRange(from = 0) int count) {
+ public Options setCropGridRowCount(@IntRange(from = 0) int count) {
mOptionBundle.putInt(EXTRA_CROP_GRID_ROW_COUNT, count);
+ return this;
}
/**
* @param count - crop grid columns count.
*/
- public void setCropGridColumnCount(@IntRange(from = 0) int count) {
+ public Options setCropGridColumnCount(@IntRange(from = 0) int count) {
mOptionBundle.putInt(EXTRA_CROP_GRID_COLUMN_COUNT, count);
+ return this;
}
/**
* @param color - desired color of crop grid/guidelines
*/
- public void setCropGridColor(@ColorInt int color) {
+ public Options setCropGridColor(@ColorInt int color) {
mOptionBundle.putInt(EXTRA_CROP_GRID_COLOR, color);
+ return this;
}
/**
* @param width - desired width of crop grid lines in pixels
*/
- public void setCropGridStrokeWidth(@IntRange(from = 0) int width) {
+ public Options setCropGridStrokeWidth(@IntRange(from = 0) int width) {
mOptionBundle.putInt(EXTRA_CROP_GRID_STROKE_WIDTH, width);
+ return this;
}
/**
* @param color - desired resolved color of the toolbar
*/
- public void setToolbarColor(@ColorInt int color) {
+ public Options setToolbarColor(@ColorInt int color) {
mOptionBundle.putInt(EXTRA_TOOL_BAR_COLOR, color);
+ return this;
}
/**
* @param color - desired resolved color of the statusbar
*/
- public void setStatusBarColor(@ColorInt int color) {
+ public Options setStatusBarColor(@ColorInt int color) {
mOptionBundle.putInt(EXTRA_STATUS_BAR_COLOR, color);
+ return this;
}
/**
* @param color - desired resolved color of the active and selected widget and progress wheel middle line (default is white)
*/
- public void setActiveControlsWidgetColor(@ColorInt int color) {
+ public Options setActiveControlsWidgetColor(@ColorInt int color) {
mOptionBundle.putInt(EXTRA_UCROP_COLOR_CONTROLS_WIDGET_ACTIVE, color);
+ return this;
}
/**
* @param color - desired resolved color of Toolbar text and buttons (default is darker orange)
*/
- public void setToolbarWidgetColor(@ColorInt int color) {
+ public Options setToolbarWidgetColor(@ColorInt int color) {
mOptionBundle.putInt(EXTRA_UCROP_WIDGET_COLOR_TOOLBAR, color);
+ return this;
}
/**
* @param text - desired text for Toolbar title
*/
- public void setToolbarTitle(@Nullable String text) {
+ public Options setToolbarTitle(@Nullable String text) {
mOptionBundle.putString(EXTRA_UCROP_TITLE_TEXT_TOOLBAR, text);
+ return this;
}
/**
* @param drawable - desired drawable for the Toolbar left cancel icon
*/
- public void setToolbarCancelDrawable(@DrawableRes int drawable) {
+ public Options setToolbarCancelDrawable(@DrawableRes int drawable) {
mOptionBundle.putInt(EXTRA_UCROP_WIDGET_CANCEL_DRAWABLE, drawable);
+ return this;
}
/**
* @param drawable - desired drawable for the Toolbar right crop icon
*/
- public void setToolbarCropDrawable(@DrawableRes int drawable) {
+ public Options setToolbarCropDrawable(@DrawableRes int drawable) {
mOptionBundle.putInt(EXTRA_UCROP_WIDGET_CROP_DRAWABLE, drawable);
+ return this;
}
/**
* @param color - desired resolved color of logo fill (default is darker grey)
*/
- public void setLogoColor(@ColorInt int color) {
+ public Options setLogoColor(@ColorInt int color) {
mOptionBundle.putInt(EXTRA_UCROP_LOGO_COLOR, color);
+ return this;
}
/**
* @param hide - set to true to hide the bottom controls (shown by default)
*/
- public void setHideBottomControls(boolean hide) {
+ public Options setHideBottomControls(boolean hide) {
mOptionBundle.putBoolean(EXTRA_HIDE_BOTTOM_CONTROLS, hide);
+ return this;
}
/**
* @param enabled - set to true to let user resize crop bounds (disabled by default)
*/
- public void setFreeStyleCropEnabled(boolean enabled) {
+ public Options setFreeStyleCropEnabled(boolean enabled) {
mOptionBundle.putBoolean(EXTRA_FREE_STYLE_CROP, enabled);
+ return this;
}
/**
@@ -443,7 +467,7 @@ public void setFreeStyleCropEnabled(boolean enabled) {
* @param selectedByDefault - index of aspect ratio option that is selected by default (starts with 0).
* @param aspectRatio - list of aspect ratio options that are available to user
*/
- public void setAspectRatioOptions(int selectedByDefault, AspectRatio... aspectRatio) {
+ public Options setAspectRatioOptions(int selectedByDefault, AspectRatio... aspectRatio) {
if (selectedByDefault > aspectRatio.length) {
throw new IllegalArgumentException(String.format(Locale.US,
"Index [selectedByDefault = %d] cannot be higher than aspect ratio options count [count = %d].",
@@ -451,13 +475,15 @@ public void setAspectRatioOptions(int selectedByDefault, AspectRatio... aspectRa
}
mOptionBundle.putInt(EXTRA_ASPECT_RATIO_SELECTED_BY_DEFAULT, selectedByDefault);
mOptionBundle.putParcelableArrayList(EXTRA_ASPECT_RATIO_OPTIONS, new ArrayList(Arrays.asList(aspectRatio)));
+ return this;
}
/**
* @param color - desired background color that should be applied to the root view
*/
- public void setRootViewBackgroundColor(@ColorInt int color) {
+ public Options setRootViewBackgroundColor(@ColorInt int color) {
mOptionBundle.putInt(EXTRA_UCROP_ROOT_VIEW_BACKGROUND_COLOR, color);
+ return this;
}
/**
@@ -467,18 +493,20 @@ public void setRootViewBackgroundColor(@ColorInt int color) {
* @param x aspect ratio X
* @param y aspect ratio Y
*/
- public void withAspectRatio(float x, float y) {
+ public Options withAspectRatio(float x, float y) {
mOptionBundle.putFloat(EXTRA_ASPECT_RATIO_X, x);
mOptionBundle.putFloat(EXTRA_ASPECT_RATIO_Y, y);
+ return this;
}
/**
* Set an aspect ratio for crop bounds that is evaluated from source image width and height.
* User won't see the menu with other ratios options.
*/
- public void useSourceImageAspectRatio() {
+ public Options useSourceImageAspectRatio() {
mOptionBundle.putFloat(EXTRA_ASPECT_RATIO_X, 0);
mOptionBundle.putFloat(EXTRA_ASPECT_RATIO_Y, 0);
+ return this;
}
/**
@@ -487,9 +515,10 @@ public void useSourceImageAspectRatio() {
* @param width max cropped image width
* @param height max cropped image height
*/
- public void withMaxResultSize(@IntRange(from = MIN_SIZE) int width, @IntRange(from = MIN_SIZE) int height) {
+ public Options withMaxResultSize(@IntRange(from = MIN_SIZE) int width, @IntRange(from = MIN_SIZE) int height) {
mOptionBundle.putInt(EXTRA_MAX_SIZE_X, width);
mOptionBundle.putInt(EXTRA_MAX_SIZE_Y, height);
+ return this;
}
}
diff --git a/image/src/main/java/com/yalantis/ucrop/UCropActivity.java b/image/src/main/java/com/yalantis/ucrop/UCropActivity.java
index 7661e9926..b7cc37c3a 100644
--- a/image/src/main/java/com/yalantis/ucrop/UCropActivity.java
+++ b/image/src/main/java/com/yalantis/ucrop/UCropActivity.java
@@ -7,6 +7,7 @@
import android.graphics.drawable.Animatable;
import android.graphics.drawable.Drawable;
import android.net.Uri;
+import android.os.Build;
import android.os.Bundle;
import android.text.TextUtils;
import android.util.Log;
@@ -175,7 +176,7 @@ public void onCreateMenu(@NonNull Menu menu, @NonNull MenuInflater menuInflater)
Drawable menuItemCropIcon = ContextCompat.getDrawable(this, mToolbarCropDrawable);
if (menuItemCropIcon != null) {
menuItemCropIcon.mutate();
- menuItemCropIcon.setColorFilter(mToolbarWidgetColor, PorterDuff.Mode.SRC_ATOP);
+ menuItemCropIcon.setColorFilter(new PorterDuffColorFilter(mToolbarWidgetColor, PorterDuff.Mode.SRC_ATOP));
menuItemCrop.setIcon(menuItemCropIcon);
}
}
@@ -209,9 +210,17 @@ protected void onStop() {
/**
* This method extracts all data from the incoming intent and setups views properly.
*/
+ @SuppressWarnings("deprecation")
private void setImageData(@NonNull Intent intent) {
- Uri inputUri = intent.getParcelableExtra(UCrop.EXTRA_INPUT_URI);
- Uri outputUri = intent.getParcelableExtra(UCrop.EXTRA_OUTPUT_URI);
+ Uri inputUri;
+ Uri outputUri;
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.TIRAMISU) {
+ inputUri = intent.getParcelableExtra(UCrop.EXTRA_INPUT_URI, Uri.class);
+ outputUri = intent.getParcelableExtra(UCrop.EXTRA_OUTPUT_URI, Uri.class);
+ } else {
+ inputUri = intent.getParcelableExtra(UCrop.EXTRA_INPUT_URI);
+ outputUri = intent.getParcelableExtra(UCrop.EXTRA_OUTPUT_URI);
+ }
processOptions(intent);
if (inputUri != null && outputUri != null) {
@@ -362,7 +371,7 @@ private void setupAppBar() {
// Color buttons inside the Toolbar
Drawable stateButtonDrawable = ContextCompat.getDrawable(this, mToolbarCancelDrawable).mutate();
- stateButtonDrawable.setColorFilter(mToolbarWidgetColor, PorterDuff.Mode.SRC_ATOP);
+ stateButtonDrawable.setColorFilter(new PorterDuffColorFilter(mToolbarWidgetColor, PorterDuff.Mode.SRC_ATOP));
toolbar.setNavigationIcon(stateButtonDrawable);
setSupportActionBar(toolbar);
@@ -416,10 +425,15 @@ private void setStatusBarColor(@ColorInt int color) {
}
}
+ @SuppressWarnings("deprecation")
private void setupAspectRatioWidget(@NonNull Intent intent) {
-
int aspectRationSelectedByDefault = intent.getIntExtra(UCrop.Options.EXTRA_ASPECT_RATIO_SELECTED_BY_DEFAULT, 0);
- ArrayList aspectRatioList = intent.getParcelableArrayListExtra(UCrop.Options.EXTRA_ASPECT_RATIO_OPTIONS);
+ ArrayList aspectRatioList;
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.TIRAMISU) {
+ aspectRatioList = intent.getParcelableArrayListExtra(UCrop.Options.EXTRA_ASPECT_RATIO_OPTIONS, AspectRatio.class);
+ } else {
+ aspectRatioList = intent.getParcelableArrayListExtra(UCrop.Options.EXTRA_ASPECT_RATIO_OPTIONS);
+ }
if (aspectRatioList == null || aspectRatioList.isEmpty()) {
aspectRationSelectedByDefault = 2;
@@ -596,9 +610,9 @@ private void setWidgetState(@IdRes int stateViewId) {
private void changeSelectedTab(int stateViewId) {
TransitionManager.beginDelayedTransition(findViewById(R.id.ucrop_photobox), mControlsTransition);
- mWrapperStateScale.findViewById(R.id.text_view_scale).setVisibility(stateViewId == R.id.state_scale ? View.VISIBLE : View.GONE);
+ mWrapperStateScale.findViewById(R.id.text_view_scale_info).setVisibility(stateViewId == R.id.state_scale ? View.VISIBLE : View.GONE);
mWrapperStateAspectRatio.findViewById(R.id.text_view_crop).setVisibility(stateViewId == R.id.state_aspect_ratio ? View.VISIBLE : View.GONE);
- mWrapperStateRotate.findViewById(R.id.text_view_rotate).setVisibility(stateViewId == R.id.state_rotate ? View.VISIBLE : View.GONE);
+ mWrapperStateRotate.findViewById(R.id.text_view_rotate_info).setVisibility(stateViewId == R.id.state_rotate ? View.VISIBLE : View.GONE);
}
diff --git a/image/src/main/java/com/yalantis/ucrop/UCropFragment.java b/image/src/main/java/com/yalantis/ucrop/UCropFragment.java
deleted file mode 100644
index a6345d1ce..000000000
--- a/image/src/main/java/com/yalantis/ucrop/UCropFragment.java
+++ /dev/null
@@ -1,569 +0,0 @@
-package com.yalantis.ucrop;
-
-import static androidx.appcompat.app.AppCompatActivity.RESULT_OK;
-
-import android.content.Context;
-import android.content.Intent;
-import android.graphics.Bitmap;
-import android.graphics.PorterDuff;
-import android.net.Uri;
-import android.os.Bundle;
-import android.text.TextUtils;
-import android.view.LayoutInflater;
-import android.view.View;
-import android.view.ViewGroup;
-import android.view.animation.AccelerateInterpolator;
-import android.widget.FrameLayout;
-import android.widget.ImageView;
-import android.widget.LinearLayout;
-import android.widget.RelativeLayout;
-import android.widget.TextView;
-
-import androidx.annotation.ColorInt;
-import androidx.annotation.IdRes;
-import androidx.annotation.IntDef;
-import androidx.annotation.NonNull;
-import androidx.annotation.Nullable;
-import androidx.appcompat.app.AppCompatDelegate;
-import androidx.core.content.ContextCompat;
-import androidx.fragment.app.Fragment;
-import androidx.transition.AutoTransition;
-import androidx.transition.Transition;
-import androidx.transition.TransitionManager;
-
-import com.yalantis.ucrop.callback.BitmapCropCallback;
-import com.yalantis.ucrop.model.AspectRatio;
-import com.yalantis.ucrop.util.SelectedStateListDrawable;
-import com.yalantis.ucrop.view.CropImageView;
-import com.yalantis.ucrop.view.GestureCropImageView;
-import com.yalantis.ucrop.view.OverlayView;
-import com.yalantis.ucrop.view.TransformImageView;
-import com.yalantis.ucrop.view.UCropView;
-import com.yalantis.ucrop.view.widget.AspectRatioTextView;
-import com.yalantis.ucrop.view.widget.HorizontalProgressWheelView;
-
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-
-import me.minetsh.imaging.R;
-
-@SuppressWarnings("ConstantConditions")
-public class UCropFragment extends Fragment {
-
- public static final int DEFAULT_COMPRESS_QUALITY = 90;
- public static final Bitmap.CompressFormat DEFAULT_COMPRESS_FORMAT = Bitmap.CompressFormat.JPEG;
-
- public static final int NONE = 0;
- public static final int SCALE = 1;
- public static final int ROTATE = 2;
- public static final int ALL = 3;
- public static final String TAG = "UCropFragment";
- private static final long CONTROLS_ANIMATION_DURATION = 50;
- private static final int TABS_COUNT = 3;
- private static final int SCALE_WIDGET_SENSITIVITY_COEFFICIENT = 15000;
- private static final int ROTATE_WIDGET_SENSITIVITY_COEFFICIENT = 42;
-
- static {
- AppCompatDelegate.setCompatVectorFromResourcesEnabled(true);
- }
-
- private final List mCropAspectRatioViews = new ArrayList<>();
- UCropFragmentCallback callback;
- UCropView mUCropView;
- GestureCropImageView mGestureCropImageView;
- View mBlockingView;
- private int mActiveControlsWidgetColor;
- @ColorInt
- private int mRootViewBackgroundColor;
- private int mLogoColor;
- private boolean mShowBottomControls;
- private Transition mControlsTransition;
- private OverlayView mOverlayView;
- private ViewGroup mWrapperStateAspectRatio, mWrapperStateRotate, mWrapperStateScale;
- private ViewGroup mLayoutAspectRatio, mLayoutRotate, mLayoutScale;
- private TextView mTextViewRotateAngle, mTextViewScalePercent;
- private final TransformImageView.TransformImageListener mImageListener = new TransformImageView.TransformImageListener() {
- @Override
- public void onRotate(float currentAngle) {
- setAngleText(currentAngle);
- }
-
- @Override
- public void onScale(float currentScale) {
- setScaleText(currentScale);
- }
-
- @Override
- public void onLoadComplete() {
- mUCropView.animate().alpha(1).setDuration(300).setInterpolator(new AccelerateInterpolator());
- mBlockingView.setClickable(false);
- callback.loadingProgress(false);
- }
-
- @Override
- public void onLoadFailure(@NonNull Exception e) {
- callback.onCropFinish(getError(e));
- }
-
- };
- private Bitmap.CompressFormat mCompressFormat = DEFAULT_COMPRESS_FORMAT;
- private int mCompressQuality = DEFAULT_COMPRESS_QUALITY;
- private int[] mAllowedGestures = {SCALE, ROTATE, ALL};
- private final View.OnClickListener mStateClickListener = v -> {
- if (!v.isSelected()) {
- setWidgetState(v.getId());
- }
- };
-
- public static UCropFragment newInstance(Bundle uCrop) {
- UCropFragment fragment = new UCropFragment();
- fragment.setArguments(uCrop);
- return fragment;
- }
-
- @Override
- public void onAttach(@NonNull Context context) {
- super.onAttach(context);
- if (getParentFragment() instanceof UCropFragmentCallback)
- callback = (UCropFragmentCallback) getParentFragment();
- else if (context instanceof UCropFragmentCallback)
- callback = (UCropFragmentCallback) context;
- else
- throw new IllegalArgumentException(context
- + " must implement UCropFragmentCallback");
- }
-
- public void setCallback(UCropFragmentCallback callback) {
- this.callback = callback;
- }
-
- @Nullable
- @Override
- public View onCreateView(@NonNull LayoutInflater inflater, @Nullable ViewGroup container, @Nullable Bundle savedInstanceState) {
- View rootView = inflater.inflate(R.layout.ucrop_fragment_photobox, container, false);
-
- Bundle args = getArguments();
-
- setupViews(rootView, args);
- setImageData(args);
- setInitialState();
- addBlockingView(rootView);
-
- return rootView;
- }
-
- public void setupViews(View view, Bundle args) {
- mActiveControlsWidgetColor = args.getInt(UCrop.Options.EXTRA_UCROP_COLOR_CONTROLS_WIDGET_ACTIVE, ContextCompat.getColor(getContext(), R.color.ucrop_color_widget_active));
- mLogoColor = args.getInt(UCrop.Options.EXTRA_UCROP_LOGO_COLOR, ContextCompat.getColor(getContext(), R.color.ucrop_color_default_logo));
- mShowBottomControls = !args.getBoolean(UCrop.Options.EXTRA_HIDE_BOTTOM_CONTROLS, false);
- mRootViewBackgroundColor = args.getInt(UCrop.Options.EXTRA_UCROP_ROOT_VIEW_BACKGROUND_COLOR, ContextCompat.getColor(getContext(), R.color.ucrop_color_crop_background));
-
- initiateRootViews(view);
- callback.loadingProgress(true);
-
- if (mShowBottomControls) {
-
- ViewGroup wrapper = view.findViewById(R.id.controls_wrapper);
- wrapper.setVisibility(View.VISIBLE);
- LayoutInflater.from(getContext()).inflate(R.layout.ucrop_controls, wrapper, true);
-
- mControlsTransition = new AutoTransition();
- mControlsTransition.setDuration(CONTROLS_ANIMATION_DURATION);
-
- mWrapperStateAspectRatio = view.findViewById(R.id.state_aspect_ratio);
- mWrapperStateAspectRatio.setOnClickListener(mStateClickListener);
- mWrapperStateRotate = view.findViewById(R.id.state_rotate);
- mWrapperStateRotate.setOnClickListener(mStateClickListener);
- mWrapperStateScale = view.findViewById(R.id.state_scale);
- mWrapperStateScale.setOnClickListener(mStateClickListener);
-
- mLayoutAspectRatio = view.findViewById(R.id.layout_aspect_ratio);
- mLayoutRotate = view.findViewById(R.id.layout_rotate_wheel);
- mLayoutScale = view.findViewById(R.id.layout_scale_wheel);
-
- setupAspectRatioWidget(args, view);
- setupRotateWidget(view);
- setupScaleWidget(view);
- setupStatesWrapper(view);
- } else {
- RelativeLayout.LayoutParams params = (RelativeLayout.LayoutParams) view.findViewById(R.id.ucrop_frame).getLayoutParams();
- params.bottomMargin = 0;
- view.findViewById(R.id.ucrop_frame).requestLayout();
- }
- }
-
- private void setImageData(@NonNull Bundle bundle) {
- Uri inputUri = bundle.getParcelable(UCrop.EXTRA_INPUT_URI);
- Uri outputUri = bundle.getParcelable(UCrop.EXTRA_OUTPUT_URI);
- processOptions(bundle);
-
- if (inputUri != null && outputUri != null) {
- try {
- mGestureCropImageView.setImageUri(inputUri, outputUri);
- } catch (Exception e) {
- callback.onCropFinish(getError(e));
- }
- } else {
- callback.onCropFinish(getError(new NullPointerException(getString(R.string.ucrop_error_input_data_is_absent))));
- }
- }
-
- /**
- * This method extracts {@link com.yalantis.ucrop.UCrop.Options #optionsBundle} from incoming bundle
- * and setups fragment, {@link OverlayView} and {@link CropImageView} properly.
- */
- @SuppressWarnings("deprecation")
- private void processOptions(@NonNull Bundle bundle) {
- // Bitmap compression options
- String compressionFormatName = bundle.getString(UCrop.Options.EXTRA_COMPRESSION_FORMAT_NAME);
- Bitmap.CompressFormat compressFormat = null;
- if (!TextUtils.isEmpty(compressionFormatName)) {
- compressFormat = Bitmap.CompressFormat.valueOf(compressionFormatName);
- }
- mCompressFormat = (compressFormat == null) ? DEFAULT_COMPRESS_FORMAT : compressFormat;
-
- mCompressQuality = bundle.getInt(UCrop.Options.EXTRA_COMPRESSION_QUALITY, UCropActivity.DEFAULT_COMPRESS_QUALITY);
-
- // Gestures options
- int[] allowedGestures = bundle.getIntArray(UCrop.Options.EXTRA_ALLOWED_GESTURES);
- if (allowedGestures != null && allowedGestures.length == TABS_COUNT) {
- mAllowedGestures = allowedGestures;
- }
-
- // Crop image view options
- mGestureCropImageView.setMaxBitmapSize(bundle.getInt(UCrop.Options.EXTRA_MAX_BITMAP_SIZE, CropImageView.DEFAULT_MAX_BITMAP_SIZE));
- mGestureCropImageView.setMaxScaleMultiplier(bundle.getFloat(UCrop.Options.EXTRA_MAX_SCALE_MULTIPLIER, CropImageView.DEFAULT_MAX_SCALE_MULTIPLIER));
- mGestureCropImageView.setImageToWrapCropBoundsAnimDuration(bundle.getInt(UCrop.Options.EXTRA_IMAGE_TO_CROP_BOUNDS_ANIM_DURATION, CropImageView.DEFAULT_IMAGE_TO_CROP_BOUNDS_ANIM_DURATION));
-
- // Overlay view options
- mOverlayView.setFreestyleCropEnabled(bundle.getBoolean(UCrop.Options.EXTRA_FREE_STYLE_CROP, OverlayView.DEFAULT_FREESTYLE_CROP_MODE != OverlayView.FREESTYLE_CROP_MODE_DISABLE));
-
- mOverlayView.setDimmedColor(bundle.getInt(UCrop.Options.EXTRA_DIMMED_LAYER_COLOR, getResources().getColor(R.color.ucrop_color_default_dimmed)));
- mOverlayView.setCircleDimmedLayer(bundle.getBoolean(UCrop.Options.EXTRA_CIRCLE_DIMMED_LAYER, OverlayView.DEFAULT_CIRCLE_DIMMED_LAYER));
-
- mOverlayView.setShowCropFrame(bundle.getBoolean(UCrop.Options.EXTRA_SHOW_CROP_FRAME, OverlayView.DEFAULT_SHOW_CROP_FRAME));
- mOverlayView.setCropFrameColor(bundle.getInt(UCrop.Options.EXTRA_CROP_FRAME_COLOR, getResources().getColor(R.color.ucrop_color_default_crop_frame)));
- mOverlayView.setCropFrameStrokeWidth(bundle.getInt(UCrop.Options.EXTRA_CROP_FRAME_STROKE_WIDTH, getResources().getDimensionPixelSize(R.dimen.ucrop_default_crop_frame_stoke_width)));
-
- mOverlayView.setShowCropGrid(bundle.getBoolean(UCrop.Options.EXTRA_SHOW_CROP_GRID, OverlayView.DEFAULT_SHOW_CROP_GRID));
- mOverlayView.setCropGridRowCount(bundle.getInt(UCrop.Options.EXTRA_CROP_GRID_ROW_COUNT, OverlayView.DEFAULT_CROP_GRID_ROW_COUNT));
- mOverlayView.setCropGridColumnCount(bundle.getInt(UCrop.Options.EXTRA_CROP_GRID_COLUMN_COUNT, OverlayView.DEFAULT_CROP_GRID_COLUMN_COUNT));
- mOverlayView.setCropGridColor(bundle.getInt(UCrop.Options.EXTRA_CROP_GRID_COLOR, getResources().getColor(R.color.ucrop_color_default_crop_grid)));
- mOverlayView.setCropGridStrokeWidth(bundle.getInt(UCrop.Options.EXTRA_CROP_GRID_STROKE_WIDTH, getResources().getDimensionPixelSize(R.dimen.ucrop_default_crop_grid_stoke_width)));
-
- // Aspect ratio options
- float aspectRatioX = bundle.getFloat(UCrop.EXTRA_ASPECT_RATIO_X, 0);
- float aspectRatioY = bundle.getFloat(UCrop.EXTRA_ASPECT_RATIO_Y, 0);
-
- int aspectRationSelectedByDefault = bundle.getInt(UCrop.Options.EXTRA_ASPECT_RATIO_SELECTED_BY_DEFAULT, 0);
- ArrayList aspectRatioList = bundle.getParcelableArrayList(UCrop.Options.EXTRA_ASPECT_RATIO_OPTIONS);
-
- if (aspectRatioX > 0 && aspectRatioY > 0) {
- if (mWrapperStateAspectRatio != null) {
- mWrapperStateAspectRatio.setVisibility(View.GONE);
- }
- mGestureCropImageView.setTargetAspectRatio(aspectRatioX / aspectRatioY);
- } else if (aspectRatioList != null && aspectRationSelectedByDefault < aspectRatioList.size()) {
- mGestureCropImageView.setTargetAspectRatio(aspectRatioList.get(aspectRationSelectedByDefault).getAspectRatioX() /
- aspectRatioList.get(aspectRationSelectedByDefault).getAspectRatioY());
- } else {
- mGestureCropImageView.setTargetAspectRatio(CropImageView.SOURCE_IMAGE_ASPECT_RATIO);
- }
-
- // Result bitmap max size options
- int maxSizeX = bundle.getInt(UCrop.EXTRA_MAX_SIZE_X, 0);
- int maxSizeY = bundle.getInt(UCrop.EXTRA_MAX_SIZE_Y, 0);
-
- if (maxSizeX > 0 && maxSizeY > 0) {
- mGestureCropImageView.setMaxResultImageSizeX(maxSizeX);
- mGestureCropImageView.setMaxResultImageSizeY(maxSizeY);
- }
- }
-
- private void initiateRootViews(View view) {
- mUCropView = view.findViewById(R.id.ucrop);
- mGestureCropImageView = mUCropView.getCropImageView();
- mOverlayView = mUCropView.getOverlayView();
-
- mGestureCropImageView.setTransformImageListener(mImageListener);
-
- ((ImageView) view.findViewById(R.id.image_view_logo)).setColorFilter(mLogoColor, PorterDuff.Mode.SRC_ATOP);
-
- view.findViewById(R.id.ucrop_frame).setBackgroundColor(mRootViewBackgroundColor);
- }
-
- /**
- * Use {@link #mActiveControlsWidgetColor} for color filter
- */
- private void setupStatesWrapper(View view) {
- ImageView stateScaleImageView = view.findViewById(R.id.image_view_state_scale);
- ImageView stateRotateImageView = view.findViewById(R.id.image_view_state_rotate);
- ImageView stateAspectRatioImageView = view.findViewById(R.id.image_view_state_aspect_ratio);
-
- stateScaleImageView.setImageDrawable(new SelectedStateListDrawable(stateScaleImageView.getDrawable(), mActiveControlsWidgetColor));
- stateRotateImageView.setImageDrawable(new SelectedStateListDrawable(stateRotateImageView.getDrawable(), mActiveControlsWidgetColor));
- stateAspectRatioImageView.setImageDrawable(new SelectedStateListDrawable(stateAspectRatioImageView.getDrawable(), mActiveControlsWidgetColor));
- }
-
- private void setupAspectRatioWidget(@NonNull Bundle bundle, View view) {
- int aspectRationSelectedByDefault = bundle.getInt(UCrop.Options.EXTRA_ASPECT_RATIO_SELECTED_BY_DEFAULT, 0);
- ArrayList aspectRatioList = bundle.getParcelableArrayList(UCrop.Options.EXTRA_ASPECT_RATIO_OPTIONS);
-
- if (aspectRatioList == null || aspectRatioList.isEmpty()) {
- aspectRationSelectedByDefault = 2;
-
- aspectRatioList = new ArrayList<>();
- aspectRatioList.add(new AspectRatio(null, 1, 1));
- aspectRatioList.add(new AspectRatio(null, 3, 4));
- aspectRatioList.add(new AspectRatio(getString(R.string.ucrop_label_original).toUpperCase(),
- CropImageView.SOURCE_IMAGE_ASPECT_RATIO, CropImageView.SOURCE_IMAGE_ASPECT_RATIO));
- aspectRatioList.add(new AspectRatio(null, 3, 2));
- aspectRatioList.add(new AspectRatio(null, 16, 9));
- }
-
- LinearLayout wrapperAspectRatioList = view.findViewById(R.id.layout_aspect_ratio);
-
- FrameLayout wrapperAspectRatio;
- AspectRatioTextView aspectRatioTextView;
- LinearLayout.LayoutParams lp = new LinearLayout.LayoutParams(0, ViewGroup.LayoutParams.MATCH_PARENT);
- lp.weight = 1;
- for (AspectRatio aspectRatio : aspectRatioList) {
- wrapperAspectRatio = (FrameLayout) getLayoutInflater().inflate(R.layout.ucrop_aspect_ratio, null);
- wrapperAspectRatio.setLayoutParams(lp);
- aspectRatioTextView = ((AspectRatioTextView) wrapperAspectRatio.getChildAt(0));
- aspectRatioTextView.setActiveColor(mActiveControlsWidgetColor);
- aspectRatioTextView.setAspectRatio(aspectRatio);
-
- wrapperAspectRatioList.addView(wrapperAspectRatio);
- mCropAspectRatioViews.add(wrapperAspectRatio);
- }
-
- mCropAspectRatioViews.get(aspectRationSelectedByDefault).setSelected(true);
-
- for (ViewGroup cropAspectRatioView : mCropAspectRatioViews) {
- cropAspectRatioView.setOnClickListener(v -> {
- mGestureCropImageView.setTargetAspectRatio(
- ((AspectRatioTextView) ((ViewGroup) v).getChildAt(0)).getAspectRatio(v.isSelected()));
- mGestureCropImageView.setImageToWrapCropBounds();
- if (!v.isSelected()) {
- for (ViewGroup cropAspectRatioView1 : mCropAspectRatioViews) {
- cropAspectRatioView1.setSelected(cropAspectRatioView1 == v);
- }
- }
- });
- }
- }
-
- private void setupRotateWidget(View view) {
- mTextViewRotateAngle = view.findViewById(R.id.text_view_rotate);
- ((HorizontalProgressWheelView) view.findViewById(R.id.rotate_scroll_wheel))
- .setScrollingListener(new HorizontalProgressWheelView.ScrollingListener() {
- @Override
- public void onScroll(float delta, float totalDistance) {
- mGestureCropImageView.postRotate(delta / ROTATE_WIDGET_SENSITIVITY_COEFFICIENT);
- }
-
- @Override
- public void onScrollEnd() {
- mGestureCropImageView.setImageToWrapCropBounds();
- }
-
- @Override
- public void onScrollStart() {
- mGestureCropImageView.cancelAllAnimations();
- }
- });
-
- ((HorizontalProgressWheelView) view.findViewById(R.id.rotate_scroll_wheel)).setMiddleLineColor(mActiveControlsWidgetColor);
-
-
- view.findViewById(R.id.wrapper_reset_rotate).setOnClickListener(v -> resetRotation());
- view.findViewById(R.id.wrapper_rotate_by_angle).setOnClickListener(v -> rotateByAngle(90));
-
- setAngleTextColor(mActiveControlsWidgetColor);
- }
-
- private void setupScaleWidget(View view) {
- mTextViewScalePercent = view.findViewById(R.id.text_view_scale);
- ((HorizontalProgressWheelView) view.findViewById(R.id.scale_scroll_wheel))
- .setScrollingListener(new HorizontalProgressWheelView.ScrollingListener() {
- @Override
- public void onScroll(float delta, float totalDistance) {
- if (delta > 0) {
- mGestureCropImageView.zoomInImage(mGestureCropImageView.getCurrentScale()
- + delta * ((mGestureCropImageView.getMaxScale() - mGestureCropImageView.getMinScale()) / SCALE_WIDGET_SENSITIVITY_COEFFICIENT));
- } else {
- mGestureCropImageView.zoomOutImage(mGestureCropImageView.getCurrentScale()
- + delta * ((mGestureCropImageView.getMaxScale() - mGestureCropImageView.getMinScale()) / SCALE_WIDGET_SENSITIVITY_COEFFICIENT));
- }
- }
-
- @Override
- public void onScrollEnd() {
- mGestureCropImageView.setImageToWrapCropBounds();
- }
-
- @Override
- public void onScrollStart() {
- mGestureCropImageView.cancelAllAnimations();
- }
- });
- ((HorizontalProgressWheelView) view.findViewById(R.id.scale_scroll_wheel)).setMiddleLineColor(mActiveControlsWidgetColor);
-
- setScaleTextColor(mActiveControlsWidgetColor);
- }
-
- void setAngleText(float angle) {
- if (mTextViewRotateAngle != null) {
- mTextViewRotateAngle.setText(String.format(Locale.getDefault(), "%.1f°", angle));
- }
- }
-
- private void setAngleTextColor(int textColor) {
- if (mTextViewRotateAngle != null) {
- mTextViewRotateAngle.setTextColor(textColor);
- }
- }
-
- void setScaleText(float scale) {
- if (mTextViewScalePercent != null) {
- mTextViewScalePercent.setText(String.format(Locale.getDefault(), "%d%%", (int) (scale * 100)));
- }
- }
-
- private void setScaleTextColor(int textColor) {
- if (mTextViewScalePercent != null) {
- mTextViewScalePercent.setTextColor(textColor);
- }
- }
-
- private void resetRotation() {
- mGestureCropImageView.postRotate(-mGestureCropImageView.getCurrentAngle());
- mGestureCropImageView.setImageToWrapCropBounds();
- }
-
- private void rotateByAngle(int angle) {
- mGestureCropImageView.postRotate(angle);
- mGestureCropImageView.setImageToWrapCropBounds();
- }
-
- private void setInitialState() {
- if (mShowBottomControls) {
- if (mWrapperStateAspectRatio.getVisibility() == View.VISIBLE) {
- setWidgetState(R.id.state_aspect_ratio);
- } else {
- setWidgetState(R.id.state_scale);
- }
- } else {
- setAllowedGestures(0);
- }
- }
-
- private void setWidgetState(@IdRes int stateViewId) {
- if (!mShowBottomControls) return;
-
- mWrapperStateAspectRatio.setSelected(stateViewId == R.id.state_aspect_ratio);
- mWrapperStateRotate.setSelected(stateViewId == R.id.state_rotate);
- mWrapperStateScale.setSelected(stateViewId == R.id.state_scale);
-
- mLayoutAspectRatio.setVisibility(stateViewId == R.id.state_aspect_ratio ? View.VISIBLE : View.GONE);
- mLayoutRotate.setVisibility(stateViewId == R.id.state_rotate ? View.VISIBLE : View.GONE);
- mLayoutScale.setVisibility(stateViewId == R.id.state_scale ? View.VISIBLE : View.GONE);
-
- changeSelectedTab(stateViewId);
-
- if (stateViewId == R.id.state_scale) {
- setAllowedGestures(0);
- } else if (stateViewId == R.id.state_rotate) {
- setAllowedGestures(1);
- } else {
- setAllowedGestures(2);
- }
- }
-
- private void changeSelectedTab(int stateViewId) {
- if (getView() != null) {
- TransitionManager.beginDelayedTransition(getView().findViewById(R.id.ucrop_photobox), mControlsTransition);
- }
- mWrapperStateScale.findViewById(R.id.text_view_scale).setVisibility(stateViewId == R.id.state_scale ? View.VISIBLE : View.GONE);
- mWrapperStateAspectRatio.findViewById(R.id.text_view_crop).setVisibility(stateViewId == R.id.state_aspect_ratio ? View.VISIBLE : View.GONE);
- mWrapperStateRotate.findViewById(R.id.text_view_rotate).setVisibility(stateViewId == R.id.state_rotate ? View.VISIBLE : View.GONE);
- }
-
- private void setAllowedGestures(int tab) {
- mGestureCropImageView.setScaleEnabled(mAllowedGestures[tab] == ALL || mAllowedGestures[tab] == SCALE);
- mGestureCropImageView.setRotateEnabled(mAllowedGestures[tab] == ALL || mAllowedGestures[tab] == ROTATE);
- }
-
- /**
- * Adds view that covers everything below the Toolbar.
- * When it's clickable - user won't be able to click/touch anything below the Toolbar.
- * Need to block user input while loading and cropping an image.
- */
- private void addBlockingView(View view) {
- if (mBlockingView == null) {
- mBlockingView = new View(getContext());
- RelativeLayout.LayoutParams lp = new RelativeLayout.LayoutParams(ViewGroup.LayoutParams.MATCH_PARENT, ViewGroup.LayoutParams.MATCH_PARENT);
- mBlockingView.setLayoutParams(lp);
- mBlockingView.setClickable(true);
- }
-
- ((RelativeLayout) view.findViewById(R.id.ucrop_photobox)).addView(mBlockingView);
- }
-
- public void cropAndSaveImage() {
- mBlockingView.setClickable(true);
- callback.loadingProgress(true);
-
- mGestureCropImageView.cropAndSaveImage(mCompressFormat, mCompressQuality, new BitmapCropCallback() {
-
- @Override
- public void onBitmapCropped(@NonNull Uri resultUri, int offsetX, int offsetY, int imageWidth, int imageHeight) {
- callback.onCropFinish(getResult(resultUri, mGestureCropImageView.getTargetAspectRatio(), offsetX, offsetY, imageWidth, imageHeight));
- callback.loadingProgress(false);
- }
-
- @Override
- public void onCropFailure(@NonNull Throwable t) {
- callback.onCropFinish(getError(t));
- }
- });
- }
-
- protected UCropResult getResult(Uri uri, float resultAspectRatio, int offsetX, int offsetY, int imageWidth, int imageHeight) {
- return new UCropResult(RESULT_OK, new Intent()
- .putExtra(UCrop.EXTRA_OUTPUT_URI, uri)
- .putExtra(UCrop.EXTRA_OUTPUT_CROP_ASPECT_RATIO, resultAspectRatio)
- .putExtra(UCrop.EXTRA_OUTPUT_IMAGE_WIDTH, imageWidth)
- .putExtra(UCrop.EXTRA_OUTPUT_IMAGE_HEIGHT, imageHeight)
- .putExtra(UCrop.EXTRA_OUTPUT_OFFSET_X, offsetX)
- .putExtra(UCrop.EXTRA_OUTPUT_OFFSET_Y, offsetY)
- );
- }
-
- protected UCropResult getError(Throwable throwable) {
- return new UCropResult(UCrop.RESULT_ERROR, new Intent().putExtra(UCrop.EXTRA_ERROR, throwable));
- }
-
- @IntDef({NONE, SCALE, ROTATE, ALL})
- @Retention(RetentionPolicy.SOURCE)
- public @interface GestureTypes {
- }
-
- public static class UCropResult {
-
- public final int mResultCode;
- public final Intent mResultData;
-
- public UCropResult(int resultCode, Intent data) {
- mResultCode = resultCode;
- mResultData = data;
- }
-
- }
-
-}
-
diff --git a/image/src/main/java/com/yalantis/ucrop/UCropFragmentCallback.java b/image/src/main/java/com/yalantis/ucrop/UCropFragmentCallback.java
deleted file mode 100644
index 31ce42554..000000000
--- a/image/src/main/java/com/yalantis/ucrop/UCropFragmentCallback.java
+++ /dev/null
@@ -1,15 +0,0 @@
-package com.yalantis.ucrop;
-
-public interface UCropFragmentCallback {
-
- /**
- * Return loader status
- */
- void loadingProgress(boolean showLoader);
-
- /**
- * Return cropping result or error
- */
- void onCropFinish(UCropFragment.UCropResult result);
-
-}
diff --git a/image/src/main/java/com/yalantis/ucrop/task/BitmapCropTask.java b/image/src/main/java/com/yalantis/ucrop/task/BitmapCropTask.java
index 9132171e6..1b7aaabd9 100644
--- a/image/src/main/java/com/yalantis/ucrop/task/BitmapCropTask.java
+++ b/image/src/main/java/com/yalantis/ucrop/task/BitmapCropTask.java
@@ -1,5 +1,6 @@
package com.yalantis.ucrop.task;
+import android.annotation.SuppressLint;
import android.content.Context;
import android.graphics.Bitmap;
import android.graphics.Matrix;
@@ -83,6 +84,7 @@ private static CompletableTransformer applyCompletableIOToMainSchedulers() {
.observeOn(AndroidSchedulers.mainThread());
}
+ @SuppressLint("CheckResult")
public void execute() {
doInBackground().compose(applyCompletableIOToMainSchedulers())
.subscribe(() -> {
diff --git a/image/src/main/java/com/yalantis/ucrop/task/BitmapLoadTask.java b/image/src/main/java/com/yalantis/ucrop/task/BitmapLoadTask.java
index 196d96c34..9e74a6e13 100644
--- a/image/src/main/java/com/yalantis/ucrop/task/BitmapLoadTask.java
+++ b/image/src/main/java/com/yalantis/ucrop/task/BitmapLoadTask.java
@@ -62,6 +62,7 @@ private static SingleTransformer applySingleIOToMainSchedulers() {
.observeOn(AndroidSchedulers.mainThread());
}
+ @SuppressLint("CheckResult")
public void execute() {
doInBackground().compose(applySingleIOToMainSchedulers()).subscribe(this::onPostExecute, e -> onPostExecute(new BitmapWorkerResult((Exception) e)));
}
diff --git a/image/src/main/java/com/yalantis/ucrop/util/BitmapLoadUtils.java b/image/src/main/java/com/yalantis/ucrop/util/BitmapLoadUtils.java
index 7b8e5a8ec..720607389 100644
--- a/image/src/main/java/com/yalantis/ucrop/util/BitmapLoadUtils.java
+++ b/image/src/main/java/com/yalantis/ucrop/util/BitmapLoadUtils.java
@@ -1,15 +1,14 @@
package com.yalantis.ucrop.util;
import android.content.Context;
+import android.content.res.Resources;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.graphics.Canvas;
import android.graphics.Matrix;
-import android.graphics.Point;
import android.net.Uri;
+import android.util.DisplayMetrics;
import android.util.Log;
-import android.view.Display;
-import android.view.WindowManager;
import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
@@ -124,19 +123,13 @@ public static int exifToTranslation(int exifOrientation) {
*
* @return - max bitmap size in pixels.
*/
- public static int calculateMaxBitmapSize(@NonNull Context context) {
- WindowManager wm = (WindowManager) context.getSystemService(Context.WINDOW_SERVICE);
- Display display;
+ public static int calculateMaxBitmapSize() {
int width, height;
- Point size = new Point();
- if (wm != null) {
- display = wm.getDefaultDisplay();
- display.getSize(size);
- }
+ DisplayMetrics dm = Resources.getSystem().getDisplayMetrics();
- width = size.x;
- height = size.y;
+ width = dm.widthPixels;
+ height = dm.heightPixels;
// Twice the device screen diagonal as default
int maxBitmapSize = (int) Math.sqrt(Math.pow(width, 2) + Math.pow(height, 2));
diff --git a/image/src/main/java/com/yalantis/ucrop/util/FastBitmapDrawable.java b/image/src/main/java/com/yalantis/ucrop/util/FastBitmapDrawable.java
index 53c621bb5..74913c118 100644
--- a/image/src/main/java/com/yalantis/ucrop/util/FastBitmapDrawable.java
+++ b/image/src/main/java/com/yalantis/ucrop/util/FastBitmapDrawable.java
@@ -22,6 +22,8 @@
import android.graphics.PixelFormat;
import android.graphics.drawable.Drawable;
+import androidx.annotation.NonNull;
+
public class FastBitmapDrawable extends Drawable {
private final Paint mPaint = new Paint(Paint.FILTER_BITMAP_FLAG);
@@ -36,7 +38,7 @@ public FastBitmapDrawable(Bitmap b) {
}
@Override
- public void draw(Canvas canvas) {
+ public void draw(@NonNull Canvas canvas) {
if (mBitmap != null && !mBitmap.isRecycled()) {
canvas.drawBitmap(mBitmap, null, getBounds(), mPaint);
}
diff --git a/image/src/main/java/com/yalantis/ucrop/util/SelectedStateListDrawable.java b/image/src/main/java/com/yalantis/ucrop/util/SelectedStateListDrawable.java
index 9943047cc..b0d02e483 100644
--- a/image/src/main/java/com/yalantis/ucrop/util/SelectedStateListDrawable.java
+++ b/image/src/main/java/com/yalantis/ucrop/util/SelectedStateListDrawable.java
@@ -1,6 +1,7 @@
package com.yalantis.ucrop.util;
import android.graphics.PorterDuff;
+import android.graphics.PorterDuffColorFilter;
import android.graphics.drawable.Drawable;
import android.graphics.drawable.StateListDrawable;
@@ -27,7 +28,7 @@ protected boolean onStateChange(int[] states) {
}
}
if (isStatePressedInArray) {
- setColorFilter(mSelectionColor, PorterDuff.Mode.SRC_ATOP);
+ setColorFilter(new PorterDuffColorFilter(mSelectionColor, PorterDuff.Mode.SRC_ATOP));
} else {
clearColorFilter();
}
diff --git a/image/src/main/java/com/yalantis/ucrop/view/GestureCropImageView.java b/image/src/main/java/com/yalantis/ucrop/view/GestureCropImageView.java
index a597995f4..0709c9f07 100644
--- a/image/src/main/java/com/yalantis/ucrop/view/GestureCropImageView.java
+++ b/image/src/main/java/com/yalantis/ucrop/view/GestureCropImageView.java
@@ -7,6 +7,8 @@
import android.view.MotionEvent;
import android.view.ScaleGestureDetector;
+import androidx.annotation.NonNull;
+
import com.yalantis.ucrop.util.RotationGestureDetector;
/**
@@ -131,7 +133,7 @@ public boolean onDoubleTap(MotionEvent e) {
}
@Override
- public boolean onScroll(MotionEvent e1, MotionEvent e2, float distanceX, float distanceY) {
+ public boolean onScroll(@NonNull MotionEvent e1, @NonNull MotionEvent e2, float distanceX, float distanceY) {
postTranslate(-distanceX, -distanceY);
return true;
}
diff --git a/image/src/main/java/com/yalantis/ucrop/view/OverlayView.java b/image/src/main/java/com/yalantis/ucrop/view/OverlayView.java
index 4ee7747b4..03f9b1c62 100644
--- a/image/src/main/java/com/yalantis/ucrop/view/OverlayView.java
+++ b/image/src/main/java/com/yalantis/ucrop/view/OverlayView.java
@@ -8,6 +8,7 @@
import android.graphics.Path;
import android.graphics.RectF;
import android.graphics.Region;
+import android.os.Build;
import android.util.AttributeSet;
import android.view.MotionEvent;
import android.view.View;
@@ -16,6 +17,7 @@
import androidx.annotation.IntDef;
import androidx.annotation.IntRange;
import androidx.annotation.NonNull;
+import androidx.core.content.ContextCompat;
import com.yalantis.ucrop.callback.OverlayViewChangeListener;
import com.yalantis.ucrop.util.RectUtils;
@@ -436,12 +438,21 @@ private int getCurrentTouchIndex(float touchX, float touchY) {
*
* @param canvas - valid canvas object
*/
+ @SuppressWarnings("deprecation")
protected void drawDimmedLayer(@NonNull Canvas canvas) {
canvas.save();
if (mCircleDimmedLayer) {
- canvas.clipPath(mCircularPath, Region.Op.DIFFERENCE);
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
+ canvas.clipOutPath(mCircularPath);
+ } else {
+ canvas.clipPath(mCircularPath, Region.Op.DIFFERENCE);
+ }
} else {
- canvas.clipRect(mCropViewRect, Region.Op.DIFFERENCE);
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
+ canvas.clipOutRect(mCropViewRect);
+ } else {
+ canvas.clipRect(mCropViewRect, Region.Op.DIFFERENCE);
+ }
}
canvas.drawColor(mDimmedColor);
canvas.restore();
@@ -458,6 +469,7 @@ protected void drawDimmedLayer(@NonNull Canvas canvas) {
*
* @param canvas - valid canvas object
*/
+ @SuppressWarnings("deprecation")
protected void drawCropGrid(@NonNull Canvas canvas) {
if (mShowCropGrid) {
if (mGridPoints == null && !mCropViewRect.isEmpty()) {
@@ -494,11 +506,19 @@ protected void drawCropGrid(@NonNull Canvas canvas) {
mTempRect.set(mCropViewRect);
mTempRect.inset(mCropRectCornerTouchAreaLineLength, -mCropRectCornerTouchAreaLineLength);
- canvas.clipRect(mTempRect, Region.Op.DIFFERENCE);
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
+ canvas.clipOutRect(mTempRect);
+ } else {
+ canvas.clipRect(mTempRect, Region.Op.DIFFERENCE);
+ }
mTempRect.set(mCropViewRect);
mTempRect.inset(-mCropRectCornerTouchAreaLineLength, mCropRectCornerTouchAreaLineLength);
- canvas.clipRect(mTempRect, Region.Op.DIFFERENCE);
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
+ canvas.clipOutRect(mTempRect);
+ } else {
+ canvas.clipRect(mTempRect, Region.Op.DIFFERENCE);
+ }
canvas.drawRect(mCropViewRect, mCropFrameCornersPaint);
@@ -513,7 +533,7 @@ protected void drawCropGrid(@NonNull Canvas canvas) {
protected void processStyledAttributes(@NonNull TypedArray a) {
mCircleDimmedLayer = a.getBoolean(R.styleable.ucrop_UCropView_ucrop_circle_dimmed_layer, DEFAULT_CIRCLE_DIMMED_LAYER);
mDimmedColor = a.getColor(R.styleable.ucrop_UCropView_ucrop_dimmed_color,
- getResources().getColor(R.color.ucrop_color_default_dimmed));
+ ContextCompat.getColor(getContext(), R.color.ucrop_color_default_dimmed));
mDimmedStrokePaint.setColor(mDimmedColor);
mDimmedStrokePaint.setStyle(Paint.Style.STROKE);
mDimmedStrokePaint.setStrokeWidth(1);
@@ -532,7 +552,7 @@ private void initCropFrameStyle(@NonNull TypedArray a) {
int cropFrameStrokeSize = a.getDimensionPixelSize(R.styleable.ucrop_UCropView_ucrop_frame_stroke_size,
getResources().getDimensionPixelSize(R.dimen.ucrop_default_crop_frame_stoke_width));
int cropFrameColor = a.getColor(R.styleable.ucrop_UCropView_ucrop_frame_color,
- getResources().getColor(R.color.ucrop_color_default_crop_frame));
+ ContextCompat.getColor(getContext(), R.color.ucrop_color_default_crop_frame));
mCropFramePaint.setStrokeWidth(cropFrameStrokeSize);
mCropFramePaint.setColor(cropFrameColor);
mCropFramePaint.setStyle(Paint.Style.STROKE);
@@ -549,7 +569,7 @@ private void initCropGridStyle(@NonNull TypedArray a) {
int cropGridStrokeSize = a.getDimensionPixelSize(R.styleable.ucrop_UCropView_ucrop_grid_stroke_size,
getResources().getDimensionPixelSize(R.dimen.ucrop_default_crop_grid_stoke_width));
int cropGridColor = a.getColor(R.styleable.ucrop_UCropView_ucrop_grid_color,
- getResources().getColor(R.color.ucrop_color_default_crop_grid));
+ ContextCompat.getColor(getContext(), R.color.ucrop_color_default_crop_grid));
mCropGridPaint.setStrokeWidth(cropGridStrokeSize);
mCropGridPaint.setColor(cropGridColor);
diff --git a/image/src/main/java/com/yalantis/ucrop/view/TransformImageView.java b/image/src/main/java/com/yalantis/ucrop/view/TransformImageView.java
index ac0914a51..5edfca37d 100644
--- a/image/src/main/java/com/yalantis/ucrop/view/TransformImageView.java
+++ b/image/src/main/java/com/yalantis/ucrop/view/TransformImageView.java
@@ -77,7 +77,7 @@ public void setScaleType(ScaleType scaleType) {
public int getMaxBitmapSize() {
if (mMaxBitmapSize <= 0) {
- mMaxBitmapSize = BitmapLoadUtils.calculateMaxBitmapSize(getContext());
+ mMaxBitmapSize = BitmapLoadUtils.calculateMaxBitmapSize();
}
return mMaxBitmapSize;
}
diff --git a/image/src/main/java/com/yalantis/ucrop/view/widget/AspectRatioTextView.java b/image/src/main/java/com/yalantis/ucrop/view/widget/AspectRatioTextView.java
index 93c3fd9f6..c66559fd8 100644
--- a/image/src/main/java/com/yalantis/ucrop/view/widget/AspectRatioTextView.java
+++ b/image/src/main/java/com/yalantis/ucrop/view/widget/AspectRatioTextView.java
@@ -124,7 +124,7 @@ private void init(@NonNull TypedArray a) {
setTitle();
- int activeColor = getResources().getColor(R.color.ucrop_color_widget_active);
+ int activeColor = ContextCompat.getColor(getContext(), R.color.ucrop_color_widget_active);
applyActiveColor(activeColor);
a.recycle();
diff --git a/image/src/main/java/com/yalantis/ucrop/view/widget/HorizontalProgressWheelView.java b/image/src/main/java/com/yalantis/ucrop/view/widget/HorizontalProgressWheelView.java
index cb4004294..bbbbb6d96 100644
--- a/image/src/main/java/com/yalantis/ucrop/view/widget/HorizontalProgressWheelView.java
+++ b/image/src/main/java/com/yalantis/ucrop/view/widget/HorizontalProgressWheelView.java
@@ -139,7 +139,7 @@ private void init() {
mProgressLinePaint = new Paint(Paint.ANTI_ALIAS_FLAG);
mProgressLinePaint.setStyle(Paint.Style.STROKE);
mProgressLinePaint.setStrokeWidth(mProgressLineWidth);
- mProgressLinePaint.setColor(getResources().getColor(R.color.ucrop_color_progress_wheel_line));
+ mProgressLinePaint.setColor(ContextCompat.getColor(getContext(), R.color.ucrop_color_progress_wheel_line));
mProgressMiddleLinePaint = new Paint(mProgressLinePaint);
mProgressMiddleLinePaint.setColor(mMiddleLineColor);
diff --git a/image/src/main/java/me/minetsh/imaging/IMGEditActivity.java b/image/src/main/java/me/minetsh/imaging/IMGEditActivity.java
index 711191777..07ad97ce6 100644
--- a/image/src/main/java/me/minetsh/imaging/IMGEditActivity.java
+++ b/image/src/main/java/me/minetsh/imaging/IMGEditActivity.java
@@ -4,6 +4,7 @@
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.net.Uri;
+import android.os.Build;
import android.text.TextUtils;
import java.io.FileNotFoundException;
@@ -33,6 +34,7 @@ public void onCreated() {
}
+ @SuppressWarnings("deprecation")
@Override
public Bitmap getBitmap() {
Intent intent = getIntent();
@@ -40,7 +42,12 @@ public Bitmap getBitmap() {
return null;
}
- Uri uri = intent.getParcelableExtra(EXTRA_IMAGE_URI);
+ Uri uri;
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.TIRAMISU) {
+ uri = intent.getParcelableExtra(EXTRA_IMAGE_URI, Uri.class);
+ } else {
+ uri = intent.getParcelableExtra(EXTRA_IMAGE_URI);
+ }
if (uri == null) {
return null;
}
@@ -48,7 +55,7 @@ public Bitmap getBitmap() {
IMGDecoder decoder = null;
String path = uri.getPath();
- if (!TextUtils.isEmpty(path)) {
+ if (!TextUtils.isEmpty(path) && uri.getScheme() != null) {
switch (uri.getScheme()) {
case "asset":
decoder = new IMGAssetFileDecoder(this, uri);
diff --git a/image/src/main/java/me/minetsh/imaging/IMGEditBaseActivity.java b/image/src/main/java/me/minetsh/imaging/IMGEditBaseActivity.java
index 9faf76a1e..b5a53fa2f 100644
--- a/image/src/main/java/me/minetsh/imaging/IMGEditBaseActivity.java
+++ b/image/src/main/java/me/minetsh/imaging/IMGEditBaseActivity.java
@@ -18,8 +18,7 @@
* Created by felix on 2017/12/5 下午3:08.
*/
-abstract class IMGEditBaseActivity extends AppCompatActivity implements View.OnClickListener,
- IMGTextEditDialog.Callback, RadioGroup.OnCheckedChangeListener,
+abstract class IMGEditBaseActivity extends AppCompatActivity implements IMGTextEditDialog.Callback, RadioGroup.OnCheckedChangeListener,
DialogInterface.OnShowListener, DialogInterface.OnDismissListener {
public static final int OP_HIDE = -1;
@@ -61,34 +60,19 @@ private void initViews() {
mColorGroup.setOnCheckedChangeListener(this);
mLayoutOpSub = findViewById(R.id.layout_op_sub);
- }
- @Override
- public void onClick(View v) {
- int vid = v.getId();
- if (vid == R.id.rb_doodle) {
- onModeClick(IMGMode.DOODLE);
- } else if (vid == R.id.btn_text) {
- onTextModeClick();
- } else if (vid == R.id.rb_mosaic) {
- onModeClick(IMGMode.MOSAIC);
- } else if (vid == R.id.btn_clip) {
- onModeClick(IMGMode.CLIP);
- } else if (vid == R.id.btn_undo) {
- onUndoClick();
- } else if (vid == R.id.tv_done) {
- onDoneClick();
- } else if (vid == R.id.tv_cancel) {
- onCancelClick();
- } else if (vid == R.id.ib_clip_cancel) {
- onCancelClipClick();
- } else if (vid == R.id.ib_clip_done) {
- onDoneClipClick();
- } else if (vid == R.id.tv_clip_reset) {
- onResetClipClick();
- } else if (vid == R.id.ib_clip_rotate) {
- onRotateClipClick();
- }
+ findViewById(R.id.ib_clip_rotate).setOnClickListener(v -> onRotateClipClick());
+ findViewById(R.id.ib_clip_cancel).setOnClickListener(v -> onCancelClipClick());
+ findViewById(R.id.tv_clip_reset).setOnClickListener(v -> onResetClipClick());
+ findViewById(R.id.ib_clip_done).setOnClickListener(v -> onDoneClipClick());
+
+ findViewById(R.id.rb_doodle).setOnClickListener(v -> onModeClick(IMGMode.DOODLE));
+ findViewById(R.id.btn_text).setOnClickListener(v -> onTextModeClick());
+ findViewById(R.id.rb_mosaic).setOnClickListener(v -> onModeClick(IMGMode.MOSAIC));
+ findViewById(R.id.btn_clip).setOnClickListener(v -> onModeClick(IMGMode.CLIP));
+ findViewById(R.id.btn_undo).setOnClickListener(v -> onUndoClick());
+ findViewById(R.id.tv_done).setOnClickListener(v -> onDoneClick());
+ findViewById(R.id.tv_cancel).setOnClickListener(v -> onCancelClick());
}
public void updateModeUI() {
diff --git a/image/src/main/java/me/minetsh/imaging/IMGTextEditDialog.java b/image/src/main/java/me/minetsh/imaging/IMGTextEditDialog.java
index 026c8675e..29f44f591 100644
--- a/image/src/main/java/me/minetsh/imaging/IMGTextEditDialog.java
+++ b/image/src/main/java/me/minetsh/imaging/IMGTextEditDialog.java
@@ -8,9 +8,10 @@
import android.view.View;
import android.view.ViewGroup;
import android.view.Window;
-import android.widget.EditText;
import android.widget.RadioGroup;
+import com.google.android.material.textfield.TextInputEditText;
+
import me.minetsh.imaging.core.IMGText;
import me.minetsh.imaging.view.IMGColorGroup;
@@ -23,7 +24,7 @@ public class IMGTextEditDialog extends Dialog implements View.OnClickListener,
private static final String TAG = "IMGTextEditDialog";
private final Callback mCallback;
- private EditText mEditText;
+ private TextInputEditText mEditText;
private IMGText mDefaultText;
private IMGColorGroup mColorGroup;
diff --git a/image/src/main/java/me/minetsh/imaging/core/IMGImage.java b/image/src/main/java/me/minetsh/imaging/core/IMGImage.java
index ede5a2da7..49c91171c 100644
--- a/image/src/main/java/me/minetsh/imaging/core/IMGImage.java
+++ b/image/src/main/java/me/minetsh/imaging/core/IMGImage.java
@@ -507,7 +507,7 @@ public void onDrawImage(Canvas canvas) {
}
public int onDrawMosaicsPath(Canvas canvas) {
- int layerCount = canvas.saveLayer(mFrame, null, Canvas.ALL_SAVE_FLAG);
+ int layerCount = canvas.saveLayer(mFrame, null);
if (!isMosaicEmpty()) {
canvas.save();
diff --git a/image/src/main/java/me/minetsh/imaging/view/IMGStickerXText.java b/image/src/main/java/me/minetsh/imaging/view/IMGStickerXText.java
index ea07f6735..9089dd31a 100644
--- a/image/src/main/java/me/minetsh/imaging/view/IMGStickerXText.java
+++ b/image/src/main/java/me/minetsh/imaging/view/IMGStickerXText.java
@@ -3,6 +3,8 @@
import android.content.res.Resources;
import android.graphics.Canvas;
import android.graphics.Paint;
+import android.graphics.text.LineBreaker;
+import android.os.Build;
import android.text.Layout;
import android.text.StaticLayout;
import android.text.TextPaint;
@@ -16,7 +18,6 @@
*/
public class IMGStickerXText extends IMGStickerX {
-
private final TextPaint mTextPaint = new TextPaint(Paint.ANTI_ALIAS_FLAG);
private StaticLayout mTextLayout;
@@ -28,12 +29,28 @@ public IMGStickerXText(IMGText text) {
setText(text);
}
+ @SuppressWarnings("deprecation")
public void setText(IMGText text) {
-
mTextPaint.setColor(text.getColor());
- mTextLayout = new StaticLayout(text.getText(), mTextPaint,
- Math.round(Resources.getSystem().getDisplayMetrics().widthPixels * 0.8f),
- Layout.Alignment.ALIGN_NORMAL, 1f, 0, false);
+ if (Build.VERSION.SDK_INT < Build.VERSION_CODES.M) {
+ mTextLayout = new StaticLayout(text.getText(), mTextPaint,
+ Math.round(Resources.getSystem().getDisplayMetrics().widthPixels * 0.8f),
+ Layout.Alignment.ALIGN_NORMAL, 1f, 0, false);
+ } else {
+ StaticLayout.Builder builder =
+ StaticLayout.Builder.obtain(text.getText(), 0, text.length(), mTextPaint, Math.round(Resources.getSystem().getDisplayMetrics().widthPixels * 0.8f))
+ .setAlignment(Layout.Alignment.ALIGN_NORMAL)
+ .setLineSpacing(0.f, 1.f)
+ .setIncludePad(false);
+
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
+ builder.setJustificationMode(LineBreaker.JUSTIFICATION_MODE_INTER_WORD);
+ }
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.P) {
+ builder.setUseLineSpacingFromFallbacks(true);
+ }
+ mTextLayout = builder.build();
+ }
float width = 0f;
for (int i = 0; i < mTextLayout.getLineCount(); i++) {
diff --git a/image/src/main/java/me/minetsh/imaging/view/IMGView.java b/image/src/main/java/me/minetsh/imaging/view/IMGView.java
index 633f1fc94..842fa6d51 100644
--- a/image/src/main/java/me/minetsh/imaging/view/IMGView.java
+++ b/image/src/main/java/me/minetsh/imaging/view/IMGView.java
@@ -23,6 +23,8 @@
import android.view.ViewParent;
import android.widget.FrameLayout;
+import androidx.annotation.NonNull;
+
import me.minetsh.imaging.core.IMGImage;
import me.minetsh.imaging.core.IMGMode;
import me.minetsh.imaging.core.IMGPath;
@@ -458,7 +460,7 @@ protected void onDetachedFromWindow() {
}
@Override
- public boolean onScale(ScaleGestureDetector detector) {
+ public boolean onScale(@NonNull ScaleGestureDetector detector) {
if (mPointerCount > 1) {
mImage.onScale(detector.getScaleFactor(),
getScrollX() + detector.getFocusX(),
@@ -470,7 +472,7 @@ public boolean onScale(ScaleGestureDetector detector) {
}
@Override
- public boolean onScaleBegin(ScaleGestureDetector detector) {
+ public boolean onScaleBegin(@NonNull ScaleGestureDetector detector) {
if (mPointerCount > 1) {
mImage.onScaleBegin();
return true;
@@ -479,7 +481,7 @@ public boolean onScaleBegin(ScaleGestureDetector detector) {
}
@Override
- public void onScaleEnd(ScaleGestureDetector detector) {
+ public void onScaleEnd(@NonNull ScaleGestureDetector detector) {
mImage.onScaleEnd();
}
@@ -532,7 +534,7 @@ public boolean onRemove(V stickerView) {
}
@Override
- public void onAnimationStart(Animator animation) {
+ public void onAnimationStart(@NonNull Animator animation) {
if (DEBUG) {
Log.d(TAG, "onAnimationStart");
}
@@ -540,7 +542,7 @@ public void onAnimationStart(Animator animation) {
}
@Override
- public void onAnimationEnd(Animator animation) {
+ public void onAnimationEnd(@NonNull Animator animation) {
if (DEBUG) {
Log.d(TAG, "onAnimationEnd");
}
@@ -550,7 +552,7 @@ public void onAnimationEnd(Animator animation) {
}
@Override
- public void onAnimationCancel(Animator animation) {
+ public void onAnimationCancel(@NonNull Animator animation) {
if (DEBUG) {
Log.d(TAG, "onAnimationCancel");
}
@@ -558,7 +560,7 @@ public void onAnimationCancel(Animator animation) {
}
@Override
- public void onAnimationRepeat(Animator animation) {
+ public void onAnimationRepeat(@NonNull Animator animation) {
// empty implementation.
}
@@ -610,17 +612,17 @@ IMGPath toPath() {
class MoveAdapter extends GestureDetector.SimpleOnGestureListener {
@Override
- public boolean onDown(MotionEvent e) {
+ public boolean onDown(@NonNull MotionEvent e) {
return true;
}
@Override
- public boolean onScroll(MotionEvent e1, MotionEvent e2, float distanceX, float distanceY) {
+ public boolean onScroll(@NonNull MotionEvent e1, @NonNull MotionEvent e2, float distanceX, float distanceY) {
return IMGView.this.onScroll(distanceX, distanceY);
}
@Override
- public boolean onFling(MotionEvent e1, MotionEvent e2, float velocityX, float velocityY) {
+ public boolean onFling(@NonNull MotionEvent e1, @NonNull MotionEvent e2, float velocityX, float velocityY) {
// TODO
return super.onFling(e1, e2, velocityX, velocityY);
}
diff --git a/image/src/main/res/layout/image_edit_clip_layout.xml b/image/src/main/res/layout/image_edit_clip_layout.xml
index 9901af82a..8af6ea3b6 100644
--- a/image/src/main/res/layout/image_edit_clip_layout.xml
+++ b/image/src/main/res/layout/image_edit_clip_layout.xml
@@ -21,7 +21,6 @@
android:layout_marginStart="20dp"
android:background="@null"
android:contentDescription="@string/image_rotate"
- android:onClick="onClick"
android:src="@drawable/image_btn_rotate" />
@@ -62,7 +59,6 @@
android:layout_marginEnd="40dp"
android:background="@null"
android:contentDescription="@string/image_done"
- android:onClick="onClick"
android:src="@drawable/image_btn_ok" />
diff --git a/image/src/main/res/layout/image_edit_opt_layout.xml b/image/src/main/res/layout/image_edit_opt_layout.xml
index 5ddb04a44..8f0b993fe 100644
--- a/image/src/main/res/layout/image_edit_opt_layout.xml
+++ b/image/src/main/res/layout/image_edit_opt_layout.xml
@@ -21,7 +21,6 @@
android:layout_marginTop="8dp"
android:background="@null"
android:contentDescription="@string/image_cancel"
- android:onClick="onClick"
android:src="@drawable/image_btn_cancel" />
@@ -122,7 +120,6 @@
android:layout_height="wrap_content"
android:background="@null"
android:contentDescription="@string/image_undo"
- android:onClick="onClick"
android:src="@drawable/image_btn_undo" />
@@ -142,7 +139,6 @@
android:layout_height="wrap_content"
android:button="@drawable/image_btn_doodle"
android:gravity="center"
- android:onClick="onClick"
android:textColor="#FFF" />
@@ -170,7 +165,6 @@
android:layout_height="wrap_content"
android:button="@drawable/image_btn_mosaic"
android:gravity="center"
- android:onClick="onClick"
android:textColor="#FFF" />
diff --git a/image/src/main/res/layout/image_text_dialog.xml b/image/src/main/res/layout/image_text_dialog.xml
index 33ba0c8c0..d6a0fc0aa 100644
--- a/image/src/main/res/layout/image_text_dialog.xml
+++ b/image/src/main/res/layout/image_text_dialog.xml
@@ -31,20 +31,26 @@
-
+ android:layout_weight="1">
+
+
+
diff --git a/image/src/main/res/layout/ucrop_controls.xml b/image/src/main/res/layout/ucrop_controls.xml
index 531f7fed7..eb124745f 100644
--- a/image/src/main/res/layout/ucrop_controls.xml
+++ b/image/src/main/res/layout/ucrop_controls.xml
@@ -82,7 +82,7 @@
android:src="@drawable/ucrop_rotate" />
@@ -99,7 +99,7 @@
android:src="@drawable/ucrop_scale" />
diff --git a/image/src/main/res/layout/ucrop_fragment_photobox.xml b/image/src/main/res/layout/ucrop_fragment_photobox.xml
deleted file mode 100644
index 70e97247e..000000000
--- a/image/src/main/res/layout/ucrop_fragment_photobox.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/image/src/main/res/values/image_colors.xml b/image/src/main/res/values/image_colors.xml
index 26e1d1263..cd8d6546a 100644
--- a/image/src/main/res/values/image_colors.xml
+++ b/image/src/main/res/values/image_colors.xml
@@ -16,7 +16,6 @@
#000
#673AB7
#20242F
- #B3BECE
#FFF
@@ -26,7 +25,6 @@
@color/ucrop_color_black
@color/ucrop_color_white
@color/ucrop_color_blaze_orange
- @color/ucrop_color_heather
@color/ucrop_color_blaze_orange
@color/ucrop_color_ebony_clay
@color/ucrop_color_blaze_orange
diff --git a/libfenrir/build.gradle b/libfenrir/build.gradle
index c725d93e3..6e589b773 100644
--- a/libfenrir/build.gradle
+++ b/libfenrir/build.gradle
@@ -29,6 +29,7 @@ android {
}
}
ndk {
+ //noinspection ChromeOsAbiSupport
abiFilters "arm64-v8a", "armeabi-v7a", "x86_64"
}
}
diff --git a/libfenrir/src/main/java/com/github/luben/zstd/ZstdBufferDecompressingStreamNoFinalizer.java b/libfenrir/src/main/java/com/github/luben/zstd/ZstdBufferDecompressingStreamNoFinalizer.java
index d74de4433..c1ed0cfe7 100644
--- a/libfenrir/src/main/java/com/github/luben/zstd/ZstdBufferDecompressingStreamNoFinalizer.java
+++ b/libfenrir/src/main/java/com/github/luben/zstd/ZstdBufferDecompressingStreamNoFinalizer.java
@@ -37,7 +37,7 @@ long initDStream(long stream) {
}
@Override
- long decompressStream(long stream, ByteBuffer dst, int dstOffset, int dstSize, ByteBuffer src, int srcOffset, int srcSize) {
+ long decompressStream(long stream, ByteBuffer dst, int dstBufPos, int dstSize, ByteBuffer src, int srcBufPos, int srcSize) {
if (!src.hasArray()) {
throw new IllegalArgumentException("provided source ByteBuffer lacks array");
}
@@ -47,7 +47,10 @@ long decompressStream(long stream, ByteBuffer dst, int dstOffset, int dstSize, B
byte[] targetArr = dst.array();
byte[] sourceArr = src.array();
- return decompressStreamNative(stream, targetArr, dstOffset, dstSize, sourceArr, srcOffset, srcSize);
+ // We are interested in array data corresponding to the pos represented by the ByteBuffer view.
+ // A ByteBuffer may share an underlying array with other ByteBuffers. In such scenario, we need to adjust the
+ // index of the array by adding an offset using arrayOffset().
+ return decompressStreamNative(stream, targetArr, dstBufPos + dst.arrayOffset(), dstSize, sourceArr, srcBufPos + src.arrayOffset(), srcSize);
}
public static int recommendedTargetBufferSize() {
diff --git a/libfenrir/src/main/jni/CMakeLists.txt b/libfenrir/src/main/jni/CMakeLists.txt
index 0b77cbd51..a83960000 100644
--- a/libfenrir/src/main/jni/CMakeLists.txt
+++ b/libfenrir/src/main/jni/CMakeLists.txt
@@ -123,49 +123,53 @@ target_include_directories(rlottie PRIVATE
rlottie/src/vector/stb)
add_library(libyuv STATIC
+ animation/libyuv/source/compare.cc
animation/libyuv/source/compare_common.cc
animation/libyuv/source/compare_gcc.cc
+ animation/libyuv/source/compare_msa.cc
animation/libyuv/source/compare_neon64.cc
- animation/libyuv/source/compare_win.cc
- animation/libyuv/source/compare.cc
+ animation/libyuv/source/compare_neon.cc
animation/libyuv/source/convert_argb.cc
+ animation/libyuv/source/convert.cc
animation/libyuv/source/convert_from_argb.cc
animation/libyuv/source/convert_from.cc
animation/libyuv/source/convert_jpeg.cc
animation/libyuv/source/convert_to_argb.cc
animation/libyuv/source/convert_to_i420.cc
- animation/libyuv/source/convert.cc
animation/libyuv/source/cpu_id.cc
animation/libyuv/source/mjpeg_decoder.cc
animation/libyuv/source/mjpeg_validate.cc
animation/libyuv/source/planar_functions.cc
animation/libyuv/source/rotate_any.cc
animation/libyuv/source/rotate_argb.cc
- animation/libyuv/source/rotate_lsx.cc
+ animation/libyuv/source/rotate.cc
animation/libyuv/source/rotate_common.cc
animation/libyuv/source/rotate_gcc.cc
+ animation/libyuv/source/rotate_lsx.cc
+ animation/libyuv/source/rotate_msa.cc
animation/libyuv/source/rotate_neon64.cc
- animation/libyuv/source/rotate_win.cc
- animation/libyuv/source/rotate.cc
+ animation/libyuv/source/rotate_neon.cc
animation/libyuv/source/row_any.cc
- animation/libyuv/source/row_rvv.cc
- animation/libyuv/source/row_lasx.cc
- animation/libyuv/source/row_lsx.cc
animation/libyuv/source/row_common.cc
animation/libyuv/source/row_gcc.cc
+ animation/libyuv/source/row_lasx.cc
+ animation/libyuv/source/row_lsx.cc
+ animation/libyuv/source/row_msa.cc
animation/libyuv/source/row_neon64.cc
- animation/libyuv/source/row_win.cc
+ animation/libyuv/source/row_neon.cc
+ animation/libyuv/source/row_rvv.cc
animation/libyuv/source/scale_any.cc
animation/libyuv/source/scale_argb.cc
+ animation/libyuv/source/scale.cc
animation/libyuv/source/scale_common.cc
- animation/libyuv/source/scale_lsx.cc
animation/libyuv/source/scale_gcc.cc
+ animation/libyuv/source/scale_lsx.cc
+ animation/libyuv/source/scale_msa.cc
animation/libyuv/source/scale_neon64.cc
- animation/libyuv/source/scale_win.cc
- animation/libyuv/source/scale.cc
- animation/libyuv/source/video_common.cc
+ animation/libyuv/source/scale_neon.cc
animation/libyuv/source/scale_rgb.cc
- animation/libyuv/source/scale_uv.cc)
+ animation/libyuv/source/scale_uv.cc
+ animation/libyuv/source/video_common.cc)
target_compile_options(libyuv PRIVATE
-ffast-math ${OPTIMIZE_NORMAL} -funroll-loops -fno-strict-aliasing -fno-math-errno ${SYM_VISIBILITY})
target_include_directories(libyuv PRIVATE
diff --git a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert.h b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert.h
index 4f6d2536e..88619a4f6 100644
--- a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert.h
+++ b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert.h
@@ -367,6 +367,23 @@ int I212ToI422(const uint16_t* src_y,
int width,
int height);
+#define H212ToH420 I212ToI420
+LIBYUV_API
+int I212ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
#define H412ToH444 I412ToI444
LIBYUV_API
int I412ToI444(const uint16_t* src_y,
@@ -384,6 +401,23 @@ int I412ToI444(const uint16_t* src_y,
int width,
int height);
+#define H412ToH420 I412ToI420
+LIBYUV_API
+int I412ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
#define I412ToI012 I410ToI010
#define H410ToH010 I410ToI010
#define H412ToH012 I410ToI010
diff --git a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert_argb.h b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert_argb.h
index 8e4562efc..35eeac9b2 100644
--- a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert_argb.h
+++ b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert_argb.h
@@ -67,6 +67,8 @@ LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full
I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
#define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
+#define I012ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \
+ I012ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k)
#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
#define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
diff --git a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h
index 3488d2568..5b244d77e 100644
--- a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h
+++ b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h
@@ -807,6 +807,7 @@ extern "C" {
#define HAS_ABGRTOYROW_RVV
#define HAS_ABGRTOYJROW_RVV
#define HAS_BGRATOYROW_RVV
+#define HAS_COPYROW_RVV
#define HAS_I400TOARGBROW_RVV
#define HAS_I422ALPHATOARGBROW_RVV
#define HAS_I422TOARGBROW_RVV
@@ -815,12 +816,15 @@ extern "C" {
#define HAS_I444ALPHATOARGBROW_RVV
#define HAS_I444TOARGBROW_RVV
#define HAS_I444TORGB24ROW_RVV
+#define HAS_INTERPOLATEROW_RVV
#define HAS_J400TOARGBROW_RVV
#define HAS_MERGEARGBROW_RVV
#define HAS_MERGERGBROW_RVV
+#define HAS_MERGEUVROW_RVV
#define HAS_MERGEXRGBROW_RVV
#define HAS_SPLITARGBROW_RVV
#define HAS_SPLITRGBROW_RVV
+#define HAS_SPLITUVROW_RVV
#define HAS_SPLITXRGBROW_RVV
#define HAS_RAWTOARGBROW_RVV
#define HAS_RAWTORGB24ROW_RVV
@@ -832,9 +836,6 @@ extern "C" {
#define HAS_RGB24TOYROW_RVV
#define HAS_RGBATOYROW_RVV
#define HAS_RGBATOYJROW_RVV
-#define HAS_SPLITARGBROW_RVV
-#define HAS_SPLITRGBROW_RVV
-#define HAS_SPLITXRGBROW_RVV
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -2242,6 +2243,10 @@ void SplitUVRow_LSX(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
+void SplitUVRow_RVV(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -2403,6 +2408,10 @@ void MergeUVRow_LSX(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
+void MergeUVRow_RVV(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
void MergeUVRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
@@ -3038,6 +3047,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count);
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -3368,15 +3378,15 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
uint8_t* dst,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
uint8_t* dst,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -3404,7 +3414,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
int width);
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -3417,15 +3427,15 @@ void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
int width);
void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width);
void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
@@ -5858,6 +5868,11 @@ void InterpolateRow_LSX(uint8_t* dst_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction);
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride_ptr,
diff --git a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/version.h b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/version.h
index 9568200e8..b6623dbbe 100644
--- a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/version.h
+++ b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1869
+#define LIBYUV_VERSION 1871
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/compare_mmi.cc b/libfenrir/src/main/jni/animation/libyuv/source/compare_mmi.cc
deleted file mode 100644
index 7640d9468..000000000
--- a/libfenrir/src/main/jni/animation/libyuv/source/compare_mmi.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for Mips MMI.
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-// Hakmem method for hamming distance.
-uint32_t HammingDistance_MMI(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0;
- uint64_t c1 = 0x5555555555555555;
- uint64_t c2 = 0x3333333333333333;
- uint64_t c3 = 0x0f0f0f0f0f0f0f0f;
- uint32_t c4 = 0x01010101;
- uint64_t s1 = 1, s2 = 2, s3 = 4;
- __asm__ volatile(
- "1: \n\t"
- "ldc1 %[ta], 0(%[src_a]) \n\t"
- "ldc1 %[tb], 0(%[src_b]) \n\t"
- "xor %[temp], %[ta], %[tb] \n\t"
- "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1
- "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1
- "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1
- "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2)
- "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2
- "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2
- "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t
- "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4
- "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4)
- "and %[temp1], %[temp1], %[c3] \n\t" //&c3
- "dmfc1 $t0, %[temp1] \n\t"
- "dsrl32 $t0, $t0, 0 \n\t "
- "mul $t0, $t0, %[c4] \n\t"
- "dsrl $t0, $t0, 24 \n\t"
- "dadd %[diff], %[diff], $t0 \n\t"
- "dmfc1 $t0, %[temp1] \n\t"
- "mul $t0, $t0, %[c4] \n\t"
- "dsrl $t0, $t0, 24 \n\t"
- "dadd %[diff], %[diff], $t0 \n\t"
- "daddiu %[src_a], %[src_a], 8 \n\t"
- "daddiu %[src_b], %[src_b], 8 \n\t"
- "addiu %[count], %[count], -8 \n\t"
- "bgtz %[count], 1b \n\t"
- "nop \n\t"
- : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b),
- [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp),
- [temp1] "+f"(temp1)
- : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1),
- [s2] "f"(s2), [s3] "f"(s3)
- : "memory");
- return diff;
-}
-
-uint32_t SumSquareError_MMI(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t sse = 0u;
- uint32_t sse_hi = 0u, sse_lo = 0u;
-
- uint64_t src1, src2;
- uint64_t diff, diff_hi, diff_lo;
- uint64_t sse_sum, sse_tmp;
-
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t"
-
- "1: \n\t"
- "ldc1 %[src1], 0x00(%[src_a]) \n\t"
- "ldc1 %[src2], 0x00(%[src_b]) \n\t"
- "pasubub %[diff], %[src1], %[src2] \n\t"
- "punpcklbh %[diff_lo], %[diff], %[mask] \n\t"
- "punpckhbh %[diff_hi], %[diff], %[mask] \n\t"
- "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t"
- "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
- "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t"
- "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
-
- "daddiu %[src_a], %[src_a], 0x08 \n\t"
- "daddiu %[src_b], %[src_b], 0x08 \n\t"
- "daddiu %[count], %[count], -0x08 \n\t"
- "bnez %[count], 1b \n\t"
-
- "mfc1 %[sse_lo], %[sse_sum] \n\t"
- "mfhc1 %[sse_hi], %[sse_sum] \n\t"
- "daddu %[sse], %[sse_hi], %[sse_lo] \n\t"
- : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1),
- [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi),
- [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp),
- [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo)
- : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count),
- [mask] "f"(mask)
- : "memory");
-
- return sse;
-}
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/compare_win.cc b/libfenrir/src/main/jni/animation/libyuv/source/compare_win.cc
deleted file mode 100644
index 9bb27f1dd..000000000
--- a/libfenrir/src/main/jni/animation/libyuv/source/compare_win.cc
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#if defined(_MSC_VER)
-#include // For __popcnt
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for 32 bit Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- !defined(__clang__) && defined(_M_IX86)
-
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- int i;
- for (i = 0; i < count - 3; i += 4) {
- uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT
- src_a += 4;
- src_b += 4;
- diff += __popcnt(x);
- }
- return diff;
-}
-
-__declspec(naked) uint32_t
- SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- pxor xmm0, xmm0
- pxor xmm5, xmm5
-
- wloop:
- movdqu xmm1, [eax]
- lea eax, [eax + 16]
- movdqu xmm2, [edx]
- lea edx, [edx + 16]
- movdqa xmm3, xmm1 // abs trick
- psubusb xmm1, xmm2
- psubusb xmm2, xmm3
- por xmm1, xmm2
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm5
- punpckhbw xmm2, xmm5
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm0, xmm1
- paddd xmm0, xmm2
- sub ecx, 16
- jg wloop
-
- pshufd xmm1, xmm0, 0xee
- paddd xmm0, xmm1
- pshufd xmm1, xmm0, 0x01
- paddd xmm0, xmm1
- movd eax, xmm0
- ret
- }
-}
-
-#ifdef HAS_SUMSQUAREERROR_AVX2
-// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable : 4752)
-__declspec(naked) uint32_t
- SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- vpxor ymm0, ymm0, ymm0 // sum
- vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
- sub edx, eax
-
- wloop:
- vmovdqu ymm1, [eax]
- vmovdqu ymm2, [eax + edx]
- lea eax, [eax + 32]
- vpsubusb ymm3, ymm1, ymm2 // abs difference trick
- vpsubusb ymm2, ymm2, ymm1
- vpor ymm1, ymm2, ymm3
- vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
- vpunpckhbw ymm1, ymm1, ymm5
- vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
- vpmaddwd ymm1, ymm1, ymm1
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm0, ymm0, ymm2
- sub ecx, 32
- jg wloop
-
- vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
- vpaddd ymm0, ymm0, ymm1
- vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
- vpaddd ymm0, ymm0, ymm1
- vpermq ymm1, ymm0, 0x02 // high + low lane.
- vpaddd ymm0, ymm0, ymm1
- vmovd eax, xmm0
- vzeroupper
- ret
- }
-}
-#endif // HAS_SUMSQUAREERROR_AVX2
-
-uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
-uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-__declspec(naked) uint32_t
- HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
- __asm {
- mov eax, [esp + 4] // src
- mov ecx, [esp + 8] // count
- movd xmm0, [esp + 12] // seed
-
- pxor xmm7, xmm7 // constant 0 for unpck
- movdqa xmm6, xmmword ptr kHash16x33
-
- wloop:
- movdqu xmm1, [eax] // src[0-15]
- lea eax, [eax + 16]
- pmulld xmm0, xmm6 // hash *= 33 ^ 16
- movdqa xmm5, xmmword ptr kHashMul0
- movdqa xmm2, xmm1
- punpcklbw xmm2, xmm7 // src[0-7]
- movdqa xmm3, xmm2
- punpcklwd xmm3, xmm7 // src[0-3]
- pmulld xmm3, xmm5
- movdqa xmm5, xmmword ptr kHashMul1
- movdqa xmm4, xmm2
- punpckhwd xmm4, xmm7 // src[4-7]
- pmulld xmm4, xmm5
- movdqa xmm5, xmmword ptr kHashMul2
- punpckhbw xmm1, xmm7 // src[8-15]
- movdqa xmm2, xmm1
- punpcklwd xmm2, xmm7 // src[8-11]
- pmulld xmm2, xmm5
- movdqa xmm5, xmmword ptr kHashMul3
- punpckhwd xmm1, xmm7 // src[12-15]
- pmulld xmm1, xmm5
- paddd xmm3, xmm4 // add 16 results
- paddd xmm1, xmm2
- paddd xmm1, xmm3
-
- pshufd xmm2, xmm1, 0x0e // upper 2 dwords
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0x01
- paddd xmm1, xmm2
- paddd xmm0, xmm1
- sub ecx, 16
- jg wloop
-
- movd eax, xmm0 // return hash
- ret
- }
-}
-
-// Visual C 2012 required for AVX2.
-#ifdef HAS_HASHDJB2_AVX2
-__declspec(naked) uint32_t
- HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
- __asm {
- mov eax, [esp + 4] // src
- mov ecx, [esp + 8] // count
- vmovd xmm0, [esp + 12] // seed
-
- wloop:
- vpmovzxbd xmm3, [eax] // src[0-3]
- vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
- vpmovzxbd xmm4, [eax + 4] // src[4-7]
- vpmulld xmm3, xmm3, xmmword ptr kHashMul0
- vpmovzxbd xmm2, [eax + 8] // src[8-11]
- vpmulld xmm4, xmm4, xmmword ptr kHashMul1
- vpmovzxbd xmm1, [eax + 12] // src[12-15]
- vpmulld xmm2, xmm2, xmmword ptr kHashMul2
- lea eax, [eax + 16]
- vpmulld xmm1, xmm1, xmmword ptr kHashMul3
- vpaddd xmm3, xmm3, xmm4 // add 16 results
- vpaddd xmm1, xmm1, xmm2
- vpaddd xmm1, xmm1, xmm3
- vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
- vpaddd xmm1, xmm1,xmm2
- vpshufd xmm2, xmm1, 0x01
- vpaddd xmm1, xmm1, xmm2
- vpaddd xmm0, xmm0, xmm1
- sub ecx, 16
- jg wloop
-
- vmovd eax, xmm0 // return hash
- vzeroupper
- ret
- }
-}
-#endif // HAS_HASHDJB2_AVX2
-
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/convert.cc b/libfenrir/src/main/jni/animation/libyuv/source/convert.cc
index 075428d09..b11ab1bff 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/convert.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/convert.cc
@@ -203,6 +203,99 @@ static int Planar16bitTo8bit(const uint16_t* src_y,
return 0;
}
+static int I41xToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth) {
+ const int scale = 1 << (24 - depth);
+
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ {
+ const int uv_width = SUBSAMPLE(width, 1, 1);
+ const int uv_height = SUBSAMPLE(height, 1, 1);
+
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
+ height);
+ ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_u,
+ dst_stride_u, src_u, dst_u, scale, kFilterBilinear);
+ ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_v,
+ dst_stride_v, src_v, dst_v, scale, kFilterBilinear);
+ }
+ return 0;
+}
+
+static int I21xToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth) {
+ const int scale = 1 << (24 - depth);
+
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ {
+ const int uv_width = SUBSAMPLE(width, 1, 1);
+ const int uv_height = SUBSAMPLE(height, 1, 1);
+ const int dy = FixedDiv(height, uv_height);
+
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
+ height);
+ ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u,
+ dst_stride_u, src_u, dst_u, 0, 32768, dy,
+ /*bpp=*/1, scale, kFilterBilinear);
+ ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v,
+ dst_stride_v, src_v, dst_v, 0, 32768, dy,
+ /*bpp=*/1, scale, kFilterBilinear);
+ }
+ return 0;
+}
+
// Convert 10 bit YUV to 8 bit.
LIBYUV_API
int I010ToI420(const uint16_t* src_y,
@@ -240,38 +333,9 @@ int I210ToI420(const uint16_t* src_y,
int dst_stride_v,
int width,
int height) {
- const int depth = 10;
- const int scale = 1 << (24 - depth);
-
- if (width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- {
- const int uv_width = SUBSAMPLE(width, 1, 1);
- const int uv_height = SUBSAMPLE(height, 1, 1);
- const int dy = FixedDiv(height, uv_height);
-
- Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
- height);
- ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u,
- dst_stride_u, src_u, dst_u, 0, 32768, dy,
- /*bpp=*/1, scale, kFilterBilinear);
- ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v,
- dst_stride_v, src_v, dst_v, 0, 32768, dy,
- /*bpp=*/1, scale, kFilterBilinear);
- }
- return 0;
+ return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 10);
}
LIBYUV_API
@@ -310,35 +374,9 @@ int I410ToI420(const uint16_t* src_y,
int dst_stride_v,
int width,
int height) {
- const int depth = 10;
- const int scale = 1 << (24 - depth);
-
- if (width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- {
- const int uv_width = SUBSAMPLE(width, 1, 1);
- const int uv_height = SUBSAMPLE(height, 1, 1);
-
- Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
- height);
- ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_u,
- dst_stride_u, src_u, dst_u, scale, kFilterBilinear);
- ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_v,
- dst_stride_v, src_v, dst_v, scale, kFilterBilinear);
- }
- return 0;
+ return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 10);
}
LIBYUV_API
@@ -404,6 +442,26 @@ int I212ToI422(const uint16_t* src_y,
0, 12);
}
+LIBYUV_API
+int I212ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 12);
+}
+
LIBYUV_API
int I412ToI444(const uint16_t* src_y,
int src_stride_y,
@@ -425,6 +483,26 @@ int I412ToI444(const uint16_t* src_y,
0, 12);
}
+LIBYUV_API
+int I412ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 12);
+}
+
// Any Ix10 To I010 format with mirroring.
static int Ix10ToI010(const uint16_t* src_y,
int src_stride_y,
@@ -955,6 +1033,11 @@ int I422ToNV21(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow = MergeUVRow_RVV;
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -995,6 +1078,11 @@ int I422ToNV21(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
@@ -1922,7 +2010,8 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
ARGBToYRow_C;
void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a,
int width) = ARGBExtractAlphaRow_C;
- if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 || height == 0) {
+ if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -2045,7 +2134,8 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
ARGBToYRow(src_argb, dst_y, width);
ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
ARGBExtractAlphaRow(src_argb, dst_a, width);
- ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a, width);
+ ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a,
+ width);
src_argb += src_stride_argb * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc b/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc
index f16c368d0..cc6560de6 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc
@@ -5645,7 +5645,7 @@ int I420ToRGB565Dither(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, int width) =
I422ToARGBRow_C;
void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
- const uint32_t dither4, int width) =
+ uint32_t dither4, int width) =
ARGBToRGB565DitherRow_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/convert_from_argb.cc b/libfenrir/src/main/jni/animation/libyuv/source/convert_from_argb.cc
index 6e05876a0..c3d037c47 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/convert_from_argb.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/convert_from_argb.cc
@@ -453,6 +453,11 @@ int ARGBToNV12(const uint8_t* src_argb,
MergeUVRow_ = MergeUVRow_LSX;
}
}
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
#endif
{
// Allocate a rows of uv.
@@ -646,6 +651,11 @@ int ARGBToNV21(const uint8_t* src_argb,
MergeUVRow_ = MergeUVRow_LSX;
}
}
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
#endif
{
// Allocate a rows of uv.
@@ -826,6 +836,11 @@ int ABGRToNV12(const uint8_t* src_abgr,
MergeUVRow_ = MergeUVRow_LSX;
}
}
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
#endif
{
// Allocate a rows of uv.
@@ -1007,6 +1022,11 @@ int ABGRToNV21(const uint8_t* src_abgr,
MergeUVRow_ = MergeUVRow_LSX;
}
}
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
#endif
{
// Allocate a rows of uv.
@@ -1721,7 +1741,7 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
int height) {
int y;
void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
- const uint32_t dither4, int width) =
+ uint32_t dither4, int width) =
ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
@@ -3203,6 +3223,11 @@ int RAWToJNV21(const uint8_t* src_raw,
MergeUVRow_ = MergeUVRow_LSX;
}
}
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow_ = MergeUVRow_RVV;
+ }
#endif
{
// Allocate a row of uv.
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc b/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc
index e741dc509..d115a2a10 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc
@@ -75,6 +75,11 @@ void CopyPlane(const uint8_t* src_y,
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
+#if defined(HAS_COPYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ CopyRow = CopyRow_RVV;
+ }
+#endif
// Copy plane
for (y = 0; y < height; ++y) {
@@ -545,6 +550,11 @@ void SplitUVPlane(const uint8_t* src_uv,
}
}
#endif
+#if defined(HAS_SPLITUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitUVRow = SplitUVRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
// Copy a row of UV.
@@ -631,6 +641,11 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
#endif
+#if defined(HAS_MERGEUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ MergeUVRow = MergeUVRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
@@ -4348,6 +4363,11 @@ int InterpolatePlane(const uint8_t* src0,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
for (y = 0; y < height; ++y) {
InterpolateRow(dst, src0, src1 - src0, width, interpolation);
@@ -5560,6 +5580,12 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_SPLITUVROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ SplitUVRow = SplitUVRow_RVV;
+ }
+#endif
+
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -5600,6 +5626,11 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
{
int awidth = halfwidth * 2;
@@ -5665,6 +5696,7 @@ void HalfMergeUVPlane(const uint8_t* src_u,
HalfMergeUVRow = HalfMergeUVRow_AVX2;
}
#endif
+
for (y = 0; y < height - 1; y += 2) {
// Merge a row of U and V into a row of UV.
HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/rotate.cc b/libfenrir/src/main/jni/animation/libyuv/source/rotate.cc
index 6797ff02b..8d3978c71 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/rotate.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/rotate.cc
@@ -214,6 +214,11 @@ void RotatePlane180(const uint8_t* src,
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
+#if defined(HAS_COPYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ CopyRow = CopyRow_RVV;
+ }
+#endif
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/rotate_argb.cc b/libfenrir/src/main/jni/animation/libyuv/source/rotate_argb.cc
index 9667f34c2..c72390108 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/rotate_argb.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/rotate_argb.cc
@@ -192,6 +192,11 @@ static int ARGBRotate180(const uint8_t* src_argb,
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
+#if defined(HAS_COPYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ CopyRow = CopyRow_RVV;
+ }
+#endif
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/rotate_mmi.cc b/libfenrir/src/main/jni/animation/libyuv/source/rotate_mmi.cc
deleted file mode 100644
index f8de60834..000000000
--- a/libfenrir/src/main/jni/animation/libyuv/source/rotate_mmi.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for Mips MMI.
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-void TransposeWx8_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
- uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
- uint8_t* src_tmp = nullptr;
-
- __asm__ volatile(
- "1: \n\t"
- "ldc1 %[tmp12], 0x00(%[src]) \n\t"
- "dadd %[src_tmp], %[src], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (00 10 01 11 02 12 03 13) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (04 14 05 15 06 16 07 17) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (20 30 21 31 22 32 23 33) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (24 34 25 35 26 36 27 37) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp4 = (00 10 20 30 01 11 21 31) */
- "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
- /* tmp5 = (02 12 22 32 03 13 23 33) */
- "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
- /* tmp6 = (04 14 24 34 05 15 25 35) */
- "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
- /* tmp7 = (06 16 26 36 07 17 27 37) */
- "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (40 50 41 51 42 52 43 53) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (44 54 45 55 46 56 47 57) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (60 70 61 71 62 72 63 73) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (64 74 65 75 66 76 67 77) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp8 = (40 50 60 70 41 51 61 71) */
- "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
- /* tmp9 = (42 52 62 72 43 53 63 73) */
- "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
- /* tmp10 = (44 54 64 74 45 55 65 75) */
- "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
- /* tmp11 = (46 56 66 76 47 57 67 77) */
- "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
-
- /* tmp0 = (00 10 20 30 40 50 60 70) */
- "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
- /* tmp1 = (01 11 21 31 41 51 61 71) */
- "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- /* tmp0 = (02 12 22 32 42 52 62 72) */
- "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
- /* tmp1 = (03 13 23 33 43 53 63 73) */
- "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- /* tmp0 = (04 14 24 34 44 54 64 74) */
- "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
- /* tmp1 = (05 15 25 35 45 55 65 75) */
- "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- /* tmp0 = (06 16 26 36 46 56 66 76) */
- "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
- /* tmp1 = (07 17 27 37 47 57 67 77) */
- "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
-
- "dadd %[dst], %[dst], %[dst_stride] \n\t"
- "daddi %[src], %[src], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
- [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
- [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
- [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
- [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst),
- [src_tmp] "+&r"(src_tmp)
- : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride),
- [dst_stride] "r"(dst_stride)
- : "memory");
-}
-
-void TransposeUVWx8_MMI(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width) {
- uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
- uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
- uint8_t* src_tmp = nullptr;
-
- __asm__ volatile(
- "1: \n\t"
- /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */
- "ldc1 %[tmp12], 0x00(%[src]) \n\t"
- "dadd %[src_tmp], %[src], %[src_stride] \n\t"
- /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */
- "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
- /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */
- "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
- /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */
- "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
- /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */
- "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */
- "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
- /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */
- "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
-
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */
- "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
- /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */
- "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
- "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
-
- /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */
- "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
- /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */
- "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
-
- /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */
- "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
- /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */
- "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
- /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */
- "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
- /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */
- "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
-
- /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */
- "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
- /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */
- "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */
- "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
- /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */
- "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */
- "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
- /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */
- "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */
- "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
- /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */
- "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
- "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
- "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
-
- "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
- "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
- "daddiu %[src], %[src], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
- [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
- [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
- [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
- [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a),
- [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp)
- : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a),
- [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride)
- : "memory");
-}
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/rotate_win.cc b/libfenrir/src/main/jni/animation/libyuv/source/rotate_win.cc
deleted file mode 100644
index a78873f84..000000000
--- a/libfenrir/src/main/jni/animation/libyuv/source/rotate_win.cc
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for 32 bit Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- !defined(__clang__) && defined(_M_IX86)
-
-__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
- int src_stride,
- uint8_t* dst,
- int dst_stride,
- int width) {
- __asm {
- push edi
- push esi
- push ebp
- mov eax, [esp + 12 + 4] // src
- mov edi, [esp + 12 + 8] // src_stride
- mov edx, [esp + 12 + 12] // dst
- mov esi, [esp + 12 + 16] // dst_stride
- mov ecx, [esp + 12 + 20] // width
-
- // Read in the data from the source pointer.
- // First round of bit swap.
- align 4
- convertloop:
- movq xmm0, qword ptr [eax]
- lea ebp, [eax + 8]
- movq xmm1, qword ptr [eax + edi]
- lea eax, [eax + 2 * edi]
- punpcklbw xmm0, xmm1
- movq xmm2, qword ptr [eax]
- movdqa xmm1, xmm0
- palignr xmm1, xmm1, 8
- movq xmm3, qword ptr [eax + edi]
- lea eax, [eax + 2 * edi]
- punpcklbw xmm2, xmm3
- movdqa xmm3, xmm2
- movq xmm4, qword ptr [eax]
- palignr xmm3, xmm3, 8
- movq xmm5, qword ptr [eax + edi]
- punpcklbw xmm4, xmm5
- lea eax, [eax + 2 * edi]
- movdqa xmm5, xmm4
- movq xmm6, qword ptr [eax]
- palignr xmm5, xmm5, 8
- movq xmm7, qword ptr [eax + edi]
- punpcklbw xmm6, xmm7
- mov eax, ebp
- movdqa xmm7, xmm6
- palignr xmm7, xmm7, 8
- // Second round of bit swap.
- punpcklwd xmm0, xmm2
- punpcklwd xmm1, xmm3
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- palignr xmm2, xmm2, 8
- palignr xmm3, xmm3, 8
- punpcklwd xmm4, xmm6
- punpcklwd xmm5, xmm7
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- palignr xmm6, xmm6, 8
- palignr xmm7, xmm7, 8
- // Third round of bit swap.
- // Write to the destination pointer.
- punpckldq xmm0, xmm4
- movq qword ptr [edx], xmm0
- movdqa xmm4, xmm0
- palignr xmm4, xmm4, 8
- movq qword ptr [edx + esi], xmm4
- lea edx, [edx + 2 * esi]
- punpckldq xmm2, xmm6
- movdqa xmm6, xmm2
- palignr xmm6, xmm6, 8
- movq qword ptr [edx], xmm2
- punpckldq xmm1, xmm5
- movq qword ptr [edx + esi], xmm6
- lea edx, [edx + 2 * esi]
- movdqa xmm5, xmm1
- movq qword ptr [edx], xmm1
- palignr xmm5, xmm5, 8
- punpckldq xmm3, xmm7
- movq qword ptr [edx + esi], xmm5
- lea edx, [edx + 2 * esi]
- movq qword ptr [edx], xmm3
- movdqa xmm7, xmm3
- palignr xmm7, xmm7, 8
- sub ecx, 8
- movq qword ptr [edx + esi], xmm7
- lea edx, [edx + 2 * esi]
- jg convertloop
-
- pop ebp
- pop esi
- pop edi
- ret
- }
-}
-
-__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int w) {
- __asm {
- push ebx
- push esi
- push edi
- push ebp
- mov eax, [esp + 16 + 4] // src
- mov edi, [esp + 16 + 8] // src_stride
- mov edx, [esp + 16 + 12] // dst_a
- mov esi, [esp + 16 + 16] // dst_stride_a
- mov ebx, [esp + 16 + 20] // dst_b
- mov ebp, [esp + 16 + 24] // dst_stride_b
- mov ecx, esp
- sub esp, 4 + 16
- and esp, ~15
- mov [esp + 16], ecx
- mov ecx, [ecx + 16 + 28] // w
-
- align 4
- // Read in the data from the source pointer.
- // First round of bit swap.
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm0 // use xmm7 as temp register.
- punpcklbw xmm0, xmm1
- punpckhbw xmm7, xmm1
- movdqa xmm1, xmm7
- movdqu xmm2, [eax]
- movdqu xmm3, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm2
- punpcklbw xmm2, xmm3
- punpckhbw xmm7, xmm3
- movdqa xmm3, xmm7
- movdqu xmm4, [eax]
- movdqu xmm5, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqa xmm7, xmm4
- punpcklbw xmm4, xmm5
- punpckhbw xmm7, xmm5
- movdqa xmm5, xmm7
- movdqu xmm6, [eax]
- movdqu xmm7, [eax + edi]
- lea eax, [eax + 2 * edi]
- movdqu [esp], xmm5 // backup xmm5
- neg edi
- movdqa xmm5, xmm6 // use xmm5 as temp register.
- punpcklbw xmm6, xmm7
- punpckhbw xmm5, xmm7
- movdqa xmm7, xmm5
- lea eax, [eax + 8 * edi + 16]
- neg edi
- // Second round of bit swap.
- movdqa xmm5, xmm0
- punpcklwd xmm0, xmm2
- punpckhwd xmm5, xmm2
- movdqa xmm2, xmm5
- movdqa xmm5, xmm1
- punpcklwd xmm1, xmm3
- punpckhwd xmm5, xmm3
- movdqa xmm3, xmm5
- movdqa xmm5, xmm4
- punpcklwd xmm4, xmm6
- punpckhwd xmm5, xmm6
- movdqa xmm6, xmm5
- movdqu xmm5, [esp] // restore xmm5
- movdqu [esp], xmm6 // backup xmm6
- movdqa xmm6, xmm5 // use xmm6 as temp register.
- punpcklwd xmm5, xmm7
- punpckhwd xmm6, xmm7
- movdqa xmm7, xmm6
-
- // Third round of bit swap.
- // Write to the destination pointer.
- movdqa xmm6, xmm0
- punpckldq xmm0, xmm4
- punpckhdq xmm6, xmm4
- movdqa xmm4, xmm6
- movdqu xmm6, [esp] // restore xmm6
- movlpd qword ptr [edx], xmm0
- movhpd qword ptr [ebx], xmm0
- movlpd qword ptr [edx + esi], xmm4
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm4
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm2 // use xmm0 as the temp register.
- punpckldq xmm2, xmm6
- movlpd qword ptr [edx], xmm2
- movhpd qword ptr [ebx], xmm2
- punpckhdq xmm0, xmm6
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm1 // use xmm0 as the temp register.
- punpckldq xmm1, xmm5
- movlpd qword ptr [edx], xmm1
- movhpd qword ptr [ebx], xmm1
- punpckhdq xmm0, xmm5
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- movdqa xmm0, xmm3 // use xmm0 as the temp register.
- punpckldq xmm3, xmm7
- movlpd qword ptr [edx], xmm3
- movhpd qword ptr [ebx], xmm3
- punpckhdq xmm0, xmm7
- sub ecx, 8
- movlpd qword ptr [edx + esi], xmm0
- lea edx, [edx + 2 * esi]
- movhpd qword ptr [ebx + ebp], xmm0
- lea ebx, [ebx + 2 * ebp]
- jg convertloop
-
- mov esp, [esp + 16]
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_common.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_common.cc
index b7655b7b1..8be37fb58 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_common.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_common.cc
@@ -342,7 +342,7 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
// or the upper byte for big endian.
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_gcc.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_gcc.cc
index aa4c0d11e..e94fd04df 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_gcc.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_gcc.cc
@@ -738,7 +738,7 @@ void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
uint8_t* dst,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
asm volatile(
"movd %3,%%xmm6 \n"
@@ -786,7 +786,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
uint8_t* dst,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
asm volatile(
"vbroadcastss %3,%%xmm6 \n"
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc
index 29ac9254d..1082ad80b 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc
@@ -1182,7 +1182,7 @@ void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
int x;
int len = width / 16;
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc
index 573fc94de..e626072a9 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc
@@ -794,10 +794,10 @@ void ARGBToUVRow_LSX(const uint8_t* src_argb0,
__m128i const_0x12 = {0x0009000900090009, 0x0009000900090009};
__m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080};
for (x = 0; x < len; x++) {
- DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32,
- src_argb0, 48, src0, src1, src2, src3);
- DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32,
- src_argb1, 48, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0,
+ 48, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, src_argb1,
+ 48, src4, src5, src6, src7);
vec0 = __lsx_vaddwev_h_bu(src0, src4);
vec1 = __lsx_vaddwev_h_bu(src1, src5);
vec2 = __lsx_vaddwev_h_bu(src2, src6);
@@ -846,8 +846,8 @@ void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
__m128i tmp0, tmp1, tmp2, tmp3;
__m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A};
for (x = 0; x < len; x++) {
- DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb,
- 48, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
tmp0 = __lsx_vshuf_b(src0, src0, shuf);
tmp1 = __lsx_vshuf_b(src1, src1, shuf);
tmp2 = __lsx_vshuf_b(src2, src2, shuf);
@@ -879,8 +879,8 @@ void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
__m128i tmp0, tmp1, tmp2, tmp3;
__m128i shuf = {0x090A040506000102, 0x000000000C0D0E08};
for (x = 0; x < len; x++) {
- DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb,
- 48, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
tmp0 = __lsx_vshuf_b(src0, src0, shuf);
tmp1 = __lsx_vshuf_b(src1, src1, shuf);
tmp2 = __lsx_vshuf_b(src2, src2, shuf);
@@ -905,9 +905,7 @@ void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
__lsx_vst(tmp3, dst_rgb, 0);
}
-void ARGBToRGB565Row_LSX(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
+void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
int len = width / 8;
__m128i zero = __lsx_vldi(0);
@@ -995,8 +993,8 @@ void ARGBToUV444Row_LSX(const uint8_t* src_argb,
__m128i const_18 = __lsx_vldi(18);
__m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080};
for (x = 0; x < len; x++) {
- DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb,
- 48, src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
tmp0 = __lsx_vpickev_h(src1, src0);
tmp1 = __lsx_vpickod_h(src1, src0);
tmp2 = __lsx_vpickev_h(src3, src2);
@@ -1138,7 +1136,7 @@ void ARGBAttenuateRow_LSX(const uint8_t* src_argb,
void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
int x;
int len = width / 8;
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_mmi.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_mmi.cc
deleted file mode 100644
index 362fd1cfc..000000000
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_mmi.cc
+++ /dev/null
@@ -1,7842 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "libyuv/row.h"
-
-#include // For memcpy and memset.
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for Mips MMI.
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-// clang-format off
-
-void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest;
- const uint64_t mask = 0xff000000ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask] \n\t"
- "or %[src1], %[src1], %[mask] \n\t"
- "punpcklwd %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask] \n\t"
- "or %[src1], %[src1], %[mask] \n\t"
- "punpcklwd %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width),
- [mask] "f"(mask)
- : "memory");
-}
-
-void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
- uint64_t src0, src1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0xff000000ULL;
- const uint64_t mask2 = 0xc6;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask1] \n\t"
- "punpcklbh %[src0], %[src0], %[mask0] \n\t"
- "pshufh %[src0], %[src0], %[mask2] \n\t"
- "or %[src1], %[src1], %[mask1] \n\t"
- "punpcklbh %[src1], %[src1], %[mask0] \n\t"
- "pshufh %[src1], %[src1], %[mask2] \n\t"
- "packushb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
-
- "or %[src0], %[src0], %[mask1] \n\t"
- "punpcklbh %[src0], %[src0], %[mask0] \n\t"
- "pshufh %[src0], %[src0], %[mask2] \n\t"
- "or %[src1], %[src1], %[mask1] \n\t"
- "punpcklbh %[src1], %[src1], %[mask0] \n\t"
- "pshufh %[src1], %[src1], %[mask2] \n\t"
- "packushb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width)
- : "memory");
-}
-
-void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
- uint64_t src0, src1;
- uint64_t ftmp[4];
- uint64_t mask0 = 0xc6;
- uint64_t mask1 = 0x6c;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t"
- "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t"
- "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t"
-
- "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
- "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
- "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
- "punpcklbh %[src1], %[src1], %[zero] \n\t"
- "pextrh %[ftmp2], %[ftmp0], %[three] \n\t"
- "pextrh %[ftmp3], %[ftmp1], %[one] \n\t"
- "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
- "pextrh %[ftmp3], %[ftmp1], %[two] \n\t"
- "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
- "pshufh %[src1], %[src1], %[mask1] \n\t"
- "pextrh %[ftmp2], %[src1], %[zero] \n\t"
- "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
- "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t"
- "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
- "packushb %[src1], %[src1], %[zero] \n\t"
-
- "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t"
- "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t"
- "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t"
- "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t"
-
- "daddiu %[src_raw], %[src_raw], 0x0c \n\t"
- "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
- [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3])
- : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width),
- [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
- [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03)
- : "memory");
-}
-
-void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- uint64_t ftmp[5];
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0007000700070007;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psrlh %[r], %[src1], %[three] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[two] \n\t"
- "psrlh %[src1], %[g], %[four] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "packushb %[b], %[b], %[r] \n\t"
- "packushb %[g], %[g], %[c1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
- "punpckhhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
- "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t"
- "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
- [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4])
- : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02),
- [four] "f"(0x04)
- : "memory");
-}
-
-void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- uint64_t ftmp[6];
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0003000300030003;
- uint64_t c3 = 0x007c007c007c007c;
- uint64_t c4 = 0x0001000100010001;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "and %[r], %[src1], %[c3] \n\t"
- "psrlh %[r], %[r], %[two] \n\t"
- "psrlh %[a], %[src1], %[seven] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[three] \n\t"
- "psrlh %[src1], %[g], %[two] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "xor %[a], %[a], %[c1] \n\t"
- "paddb %[a], %[a], %[c4] \n\t"
- "packushb %[b], %[b], %[r] \n\t"
- "packushb %[g], %[g], %[a] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
- "punpckhhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
- "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t"
- "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
- [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
- : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05),
- [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
- : "memory");
-}
-
-void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- uint64_t ftmp[6];
- uint64_t c0 = 0x000f000f000f000f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g], %[src0], %[four] \n\t"
- "and %[r], %[src1], %[c0] \n\t"
- "psrlh %[a], %[src1], %[four] \n\t"
- "psllh %[src0], %[b], %[four] \n\t"
- "or %[b], %[src0], %[b] \n\t"
- "psllh %[src0], %[g], %[four] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psllh %[src0], %[r], %[four] \n\t"
- "or %[r], %[src0], %[r] \n\t"
- "psllh %[src0], %[a], %[four] \n\t"
- "or %[a], %[src0], %[a] \n\t"
- "packushb %[b], %[b], %[r] \n\t"
- "packushb %[g], %[g], %[a] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
- "punpckhhw %[r], %[src0], %[src1] \n\t"
- "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
- "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t"
- "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
- [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
- : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08),
- [four] "f"(0x04)
- : "memory");
-}
-
-void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
- uint64_t src;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t"
-
- "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t"
- "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t"
- "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width)
- : "memory");
-}
-
-void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
- uint64_t src0, src1;
- uint64_t ftmp[3];
- uint64_t mask0 = 0xc6;
- uint64_t mask1 = 0x18;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
- "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
- "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
- "punpcklbh %[ftmp2], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
-
- "pextrh %[src0], %[ftmp1], %[two] \n\t"
- "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t"
- "pshufh %[ftmp1], %[ftmp1], %[one] \n\t"
-
- "pextrh %[src0], %[ftmp2], %[two] \n\t"
- "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t"
- "pextrh %[src0], %[ftmp2], %[one] \n\t"
- "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t"
- "pextrh %[src0], %[ftmp2], %[zero] \n\t"
- "pshufh %[src1], %[src1], %[mask1] \n\t"
- "pinsrh_0 %[src1], %[src1], %[src0] \n\t"
- "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
- "packushb %[src1], %[src1], %[zero] \n\t"
-
- "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t"
- "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t"
- "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
- [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
- [one] "f"(0x01), [two] "f"(0x02)
- : "memory");
-}
-
-void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
- uint64_t src0, src1;
- uint64_t ftmp[3];
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
-
- "psrlh %[b], %[b], %[three] \n\t"
- "psrlh %[g], %[g], %[two] \n\t"
- "psrlh %[r], %[r], %[three] \n\t"
-
- "psllh %[g], %[g], %[five] \n\t"
- "psllh %[r], %[r], %[eleven] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05),
- [eleven] "f"(0x0b)
- : "memory");
-}
-
-// dither4 is a row of 4 values from 4x4 dither matrix.
-// The 4x4 matrix contains values to increase RGB. When converting to
-// fewer bits (565) this provides an ordered dither.
-// The order in the 4x4 matrix in first byte is upper left.
-// The 4 values are passed as an int, then referenced as an array, so
-// endian will not affect order of the original matrix. But the dither4
-// will containing the first pixel in the lower byte for little endian
-// or the upper byte for big endian.
-void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- uint64_t src0, src1;
- uint64_t ftmp[3];
- uint64_t c0 = 0x00ff00ff00ff00ff;
-
- __asm__ volatile(
- "punpcklbh %[dither], %[dither], %[zero] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
-
- "paddh %[b], %[b], %[dither] \n\t"
- "paddh %[g], %[g], %[dither] \n\t"
- "paddh %[r], %[r], %[dither] \n\t"
- "pcmpgth %[src0], %[b], %[c0] \n\t"
- "or %[src0], %[src0], %[b] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "pcmpgth %[src0], %[g], %[c0] \n\t"
- "or %[src0], %[src0], %[g] \n\t"
- "and %[g], %[src0], %[c0] \n\t"
- "pcmpgth %[src0], %[r], %[c0] \n\t"
- "or %[src0], %[src0], %[r] \n\t"
- "and %[r], %[src0], %[c0] \n\t"
-
- "psrlh %[b], %[b], %[three] \n\t"
- "psrlh %[g], %[g], %[two] \n\t"
- "psrlh %[r], %[r], %[three] \n\t"
-
- "psllh %[g], %[g], %[five] \n\t"
- "psllh %[r], %[r], %[eleven] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02),
- [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b)
- : "memory");
-}
-
-void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- uint64_t src0, src1;
- uint64_t ftmp[4];
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
- "punpckhbh %[a], %[src1], %[zero] \n\t"
-
- "psrlh %[b], %[b], %[three] \n\t"
- "psrlh %[g], %[g], %[three] \n\t"
- "psrlh %[r], %[r], %[three] \n\t"
- "psrlh %[a], %[a], %[seven] \n\t"
-
- "psllh %[g], %[g], %[five] \n\t"
- "psllh %[r], %[r], %[ten] \n\t"
- "psllh %[a], %[a], %[fifteen] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
- "or %[b], %[b], %[a] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05),
- [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f)
- : "memory");
-}
-
-void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- uint64_t src0, src1;
- uint64_t ftmp[4];
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
-
- "punpcklbh %[b], %[src0], %[src1] \n\t"
- "punpckhbh %[g], %[src0], %[src1] \n\t"
- "punpcklbh %[src0], %[b], %[g] \n\t"
- "punpckhbh %[src1], %[b], %[g] \n\t"
- "punpcklbh %[b], %[src0], %[zero] \n\t"
- "punpckhbh %[g], %[src0], %[zero] \n\t"
- "punpcklbh %[r], %[src1], %[zero] \n\t"
- "punpckhbh %[a], %[src1], %[zero] \n\t"
-
- "psrlh %[b], %[b], %[four] \n\t"
- "psrlh %[g], %[g], %[four] \n\t"
- "psrlh %[r], %[r], %[four] \n\t"
- "psrlh %[a], %[a], %[four] \n\t"
-
- "psllh %[g], %[g], %[four] \n\t"
- "psllh %[r], %[r], %[eight] \n\t"
- "psllh %[a], %[a], %[twelve] \n\t"
- "or %[b], %[b], %[g] \n\t"
- "or %[b], %[b], %[r] \n\t"
- "or %[b], %[b], %[a] \n\t"
-
- "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
- "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
- "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
- "daddiu %[width], %[width], -0x04 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
- [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
- : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
- [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08),
- [twelve] "f"(0x0c)
- : "memory");
-}
-
-void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001004200810019;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void ARGBToUVRow_MMI(const uint8_t* src_rgb,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[13];
- uint64_t tmp[1];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0013002500380002;
- const uint64_t mask_v = 0x00020038002f0009;
-
- __asm__ volatile(
- "dli %[tmp0], 0x0001000100010001 \n\t"
- "dmtc1 %[tmp0], %[ftmp12] \n\t"
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
- [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
- : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0019008100420001;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void BGRAToUVRow_MMI(const uint8_t* src_rgb,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[13];
- uint64_t tmp[1];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0002003800250013;
- const uint64_t mask_v = 0x0009002f00380002;
-
- __asm__ volatile(
- "dli %[tmp0], 0x0001000100010001 \n\t"
- "dmtc1 %[tmp0], %[ftmp12] \n\t"
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsrl %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src1], %[src0] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src0], %[src1] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsrl %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src1], %[src0] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src0], %[src1] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsrl %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src1], %[src0] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src0], %[src1] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsrl %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src1], %[src0] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src0], %[src1] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
- [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
- : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001001900810042;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void ABGRToUVRow_MMI(const uint8_t* src_rgb,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[13];
- uint64_t tmp[1];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0002003800250013;
- const uint64_t mask_v = 0x0009002F00380002;
-
- __asm__ volatile(
- "dli %[tmp0], 0x0001000100010001 \n\t"
- "dmtc1 %[tmp0], %[ftmp12] \n\t"
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
- "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src1], %[src0] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src0], %[src1] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
- "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src1], %[src0] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src0], %[src1] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
- "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src1], %[src0] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src0], %[src1] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
- "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src1], %[src0] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src0], %[src1] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
- [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
- : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0042008100190001;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void RGBAToUVRow_MMI(const uint8_t* src_rgb,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[13];
- uint64_t tmp[1];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0013002500380002;
- const uint64_t mask_v = 0x00020038002f0009;
-
- __asm__ volatile(
- "dli %[tmp0], 0x0001000100010001 \n\t"
- "dmtc1 %[tmp0], %[ftmp12] \n\t"
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t"
- "dsrl %[dest0_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t"
- "dsrl %[dest1_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t"
- "dsrl %[dest2_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t"
- "dsrl %[dest3_v], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
- "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
- [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
- : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001004200810019;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x06(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x13(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x19(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x12(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x18 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void RGB24ToUVRow_MMI(const uint8_t* src_rgb,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[13];
- uint64_t tmp[1];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0013002500380002;
- const uint64_t mask_v = 0x00020038002f0009;
-
- __asm__ volatile(
- "dli %[tmp0], 0x0001000100010001 \n\t"
- "dmtc1 %[tmp0], %[ftmp12] \n\t"
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
- [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
- : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest0, dest1, dest2, dest3;
- const uint64_t value = 0x1080;
- const uint64_t mask = 0x0001001900810042;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[dest0], %[src] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x06(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[dest1], %[src] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x13(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[dest2], %[src] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "gsldlc1 %[src], 0x19(%[src_argb]) \n\t"
- "gsldrc1 %[src], 0x12(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "dsll %[src], %[src], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src], %[zero] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[dest3], %[src] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x18 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3)
- : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
- [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
- [zero] "f"(0x00)
- : "memory");
-}
-
-void RAWToUVRow_MMI(const uint8_t* src_rgb,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[13];
- uint64_t tmp[1];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0002003800250013;
- const uint64_t mask_v = 0x0009002f00380002;
-
- __asm__ volatile(
- "dli %[tmp0], 0x0001000100010001 \n\t"
- "dmtc1 %[tmp0], %[ftmp12] \n\t"
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
- "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src1], %[src0] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src0], %[src1] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
- "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src1], %[src0] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src0], %[src1] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
- "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src1], %[src0] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src0], %[src1] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
- "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "dsll %[src0], %[src0], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src_hi] \n\t"
- "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_lo] \n\t"
- "dsll %[src1], %[src1], %[eight] \n\t"
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src0], %[src_hi] \n\t"
- "paddh %[src0], %[src0], %[ftmp12] \n\t"
- "psrlh %[src0], %[src0], %[one] \n\t"
- "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
- "dsll %[src_hi], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src1], %[src0] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src0], %[src1] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
- [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
- : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- uint64_t src, src_hi, src_lo;
- uint64_t dest, dest0, dest1, dest2, dest3;
- uint64_t tmp0, tmp1;
- const uint64_t shift = 0x08;
- const uint64_t value = 0x80;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x0001004D0096001DULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
-
- "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest2], %[dest2], %[shift] \n\t"
-
- "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
- "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest3], %[dest3], %[shift] \n\t"
-
- "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
- "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
- "packushb %[dest], %[tmp0], %[tmp1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
- [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
- [tmp1] "=&f"(tmp1)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value),
- [width] "r"(width)
- : "memory");
-}
-
-void ARGBToUVJRow_MMI(const uint8_t* src_rgb,
- int src_stride_rgb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src_rgb1;
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0015002a003f0002;
- const uint64_t mask_v = 0x0002003f0035000a;
-
- __asm__ volatile(
- "1: \n\t"
- "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src0] \n\t"
- "paddh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src0] \n\t"
- "paddh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src0] \n\t"
- "paddh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src0] \n\t"
- "paddh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src0] \n\t"
- "paddh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src0] \n\t"
- "paddh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src0] \n\t"
- "paddh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
-
- "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t"
- "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
- "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "punpcklbh %[src0], %[src1], %[zero] \n\t"
- "punpckhbh %[src1], %[src1], %[zero] \n\t"
- "paddh %[src0], %[src_lo], %[src0] \n\t"
- "paddh %[src1], %[src_hi], %[src1] \n\t"
- "pavgh %[src0], %[src0], %[src1] \n\t"
- "dsll %[src_lo], %[src0], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
- [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
- [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
- [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08),
- [sixteen] "f"(0x10)
- : "memory");
-}
-
-void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
- uint64_t ftmp[11];
- const uint64_t value = 0x1080108010801080;
- const uint64_t mask = 0x0001004200810019;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0007000700070007;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psrlh %[r], %[src1], %[three] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[two] \n\t"
- "psrlh %[src1], %[g], %[four] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[src0], %[src1] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[src0], %[src1] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psrlh %[r], %[src1], %[three] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[two] \n\t"
- "psrlh %[src1], %[g], %[four] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[src0], %[src1] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[src0], %[src1] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddiu %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
- [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
- [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
- : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05),
- [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04)
- : "memory");
-}
-
-void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555,
- uint8_t* dst_y,
- int width) {
- uint64_t ftmp[11];
- const uint64_t value = 0x1080108010801080;
- const uint64_t mask = 0x0001004200810019;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0003000300030003;
- uint64_t c3 = 0x007c007c007c007c;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "and %[r], %[src1], %[c3] \n\t"
- "psrlh %[r], %[r], %[two] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[three] \n\t"
- "psrlh %[src1], %[g], %[two] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[src0], %[src1] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[src0], %[src1] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g], %[src1], %[c2] \n\t"
- "psllh %[g], %[g], %[three] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "and %[r], %[src1], %[c3] \n\t"
- "psrlh %[r], %[r], %[two] \n\t"
- "psllh %[src0], %[b], %[three] \n\t"
- "psrlh %[src1], %[b], %[two] \n\t"
- "or %[b], %[src0], %[src1] \n\t"
- "psllh %[src0], %[g], %[three] \n\t"
- "psrlh %[src1], %[g], %[two] \n\t"
- "or %[g], %[src0], %[src1] \n\t"
- "psllh %[src0], %[r], %[three] \n\t"
- "psrlh %[src1], %[r], %[two] \n\t"
- "or %[r], %[src0], %[src1] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[src0], %[src1] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[src0], %[src1] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddiu %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
- [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
- [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
- : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y),
- [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
- [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08),
- [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
- : "memory");
-}
-
-void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444,
- uint8_t* dst_y,
- int width) {
- uint64_t ftmp[11];
- uint64_t value = 0x1080108010801080;
- uint64_t mask = 0x0001004200810019;
- uint64_t c0 = 0x000f000f000f000f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g], %[src0], %[four] \n\t"
- "and %[r], %[src1], %[c0] \n\t"
- "psllh %[src0], %[b], %[four] \n\t"
- "or %[b], %[src0], %[b] \n\t"
- "psllh %[src0], %[g], %[four] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psllh %[src0], %[r], %[four] \n\t"
- "or %[r], %[src0], %[r] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest0], %[src0], %[src1] \n\t"
- "psrlw %[dest0], %[dest0], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest1], %[src0], %[src1] \n\t"
- "psrlw %[dest1], %[dest1], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
- "psrlh %[src1], %[src0], %[eight] \n\t"
- "and %[b], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g], %[src0], %[four] \n\t"
- "and %[r], %[src1], %[c0] \n\t"
- "psllh %[src0], %[b], %[four] \n\t"
- "or %[b], %[src0], %[b] \n\t"
- "psllh %[src0], %[g], %[four] \n\t"
- "or %[g], %[src0], %[g] \n\t"
- "psllh %[src0], %[r], %[four] \n\t"
- "or %[r], %[src0], %[r] \n\t"
- "punpcklhw %[src0], %[b], %[r] \n\t"
- "punpcklhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest2], %[src0], %[src1] \n\t"
- "psrlw %[dest2], %[dest2], %[eight] \n\t"
-
- "punpckhhw %[src0], %[b], %[r] \n\t"
- "punpckhhw %[src1], %[g], %[value] \n\t"
- "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
- "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
- "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
- "paddw %[dest3], %[src0], %[src1] \n\t"
- "psrlw %[dest3], %[dest3], %[eight] \n\t"
-
- "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
- "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
-
- "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t"
- "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
- "daddiu %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
- [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
- [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
- : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y),
- [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
- [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04)
- : "memory");
-}
-
-void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
- int src_stride_rgb565,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[13];
- uint64_t value = 0x2020202020202020;
- uint64_t mask_u = 0x0026004a00700002;
- uint64_t mask_v = 0x00020070005e0012;
- uint64_t mask = 0x93;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0007000700070007;
- __asm__ volatile(
- "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t"
- "psrlh %[dest0_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest0_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest0_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest0_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest0_v], %[src0], %[c2] \n\t"
- "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
- "or %[dest0_v], %[src1], %[dest0_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest0_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest0_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t"
- "psrlh %[dest1_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest1_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest1_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest1_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest1_v], %[src0], %[c2] \n\t"
- "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
- "or %[dest1_v], %[src1], %[dest1_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest1_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest1_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t"
- "psrlh %[dest2_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest2_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest2_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest2_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest2_v], %[src0], %[c2] \n\t"
- "psllh %[dest2_v], %[dest2_v], %[three] \n\t"
- "or %[dest2_v], %[src1], %[dest2_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest2_u] \n\t"
- "paddh %[g0], %[g0], %[dest2_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest2_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t"
- "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t"
- "psrlh %[dest3_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest3_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "psrlh %[r0], %[dest3_u], %[three] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest3_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest3_v], %[src0], %[c2] \n\t"
- "psllh %[dest3_v], %[dest3_v], %[three] \n\t"
- "or %[dest3_v], %[src1], %[dest3_v] \n\t"
- "psrlh %[src0], %[src0], %[three] \n\t"
- "paddh %[b0], %[b0], %[dest3_u] \n\t"
- "paddh %[g0], %[g0], %[dest3_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest3_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t"
- "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddiu %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
- [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
- [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
- [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
- [dest3_v] "=&f"(ftmp[12])
- : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
- [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
- [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
- [one] "f"(0x01)
- : "memory");
-}
-
-void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
- int src_stride_argb1555,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[11];
- uint64_t value = 0x2020202020202020;
- uint64_t mask_u = 0x0026004a00700002;
- uint64_t mask_v = 0x00020070005e0012;
- uint64_t mask = 0x93;
- uint64_t c0 = 0x001f001f001f001f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t c2 = 0x0003000300030003;
- uint64_t c3 = 0x007c007c007c007c;
- __asm__ volatile(
- "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t"
- "psrlh %[dest0_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest0_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest0_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest0_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest0_v], %[src0], %[c2] \n\t"
- "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
- "or %[dest0_v], %[src1], %[dest0_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest0_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest0_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t"
- "psrlh %[dest1_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest1_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest1_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest1_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest1_v], %[src0], %[c2] \n\t"
- "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
- "or %[dest1_v], %[src1], %[dest1_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest1_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest1_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t"
- "psrlh %[dest2_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest2_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest2_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest2_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest0_v], %[src0], %[c2] \n\t"
- "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
- "or %[dest0_v], %[src1], %[dest0_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest2_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest2_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest2_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest2_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t"
- "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t"
- "psrlh %[dest3_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[src0], %[src0], %[five] \n\t"
- "and %[g0], %[dest3_u], %[c2] \n\t"
- "psllh %[g0], %[g0], %[three] \n\t"
- "or %[g0], %[src0], %[g0] \n\t"
- "and %[r0], %[dest3_u], %[c3] \n\t"
- "psrlh %[r0], %[r0], %[two] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest3_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[src1], %[src1], %[five] \n\t"
- "and %[dest1_v], %[src0], %[c2] \n\t"
- "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
- "or %[dest1_v], %[src1], %[dest1_v] \n\t"
- "and %[src0], %[src0], %[c3] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
- "paddh %[b0], %[b0], %[dest3_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest3_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[six] \n\t"
- "psllh %[r0], %[src0], %[one] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[six] \n\t"
- "psllh %[g0], %[g0], %[one] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest3_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest3_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[dest0_u], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
- "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t"
- "packushb %[dest0_v], %[dest1_u], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t"
- "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddiu %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
- [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
- [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
- [dest1_v] "=&f"(ftmp[10])
- : [src_argb1555] "r"(src_argb1555),
- [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u),
- [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
- [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3),
- [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
- [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
- [two] "f"(0x02), [one] "f"(0x01)
- : "memory");
-}
-
-void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
- int src_stride_argb4444,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[13];
- uint64_t value = 0x2020202020202020;
- uint64_t mask_u = 0x0026004a00700002;
- uint64_t mask_v = 0x00020070005e0012;
- uint64_t mask = 0x93;
- uint64_t c0 = 0x000f000f000f000f;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- __asm__ volatile(
- "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t"
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t"
- "psrlh %[dest0_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest0_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest0_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest0_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest0_u] \n\t"
- "paddh %[g0], %[g0], %[dest0_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest0_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t"
- "psrlh %[dest1_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest1_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest1_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest1_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest1_u] \n\t"
- "paddh %[g0], %[g0], %[dest1_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest1_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t"
- "psrlh %[dest2_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest2_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest2_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest2_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest2_u] \n\t"
- "paddh %[g0], %[g0], %[dest2_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest2_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t"
- "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t"
- "psrlh %[dest3_u], %[src0], %[eight] \n\t"
- "and %[b0], %[src0], %[c0] \n\t"
- "and %[src0], %[src0], %[c1] \n\t"
- "psrlh %[g0], %[src0], %[four] \n\t"
- "and %[r0], %[dest3_u], %[c0] \n\t"
- "psrlh %[src0], %[src1], %[eight] \n\t"
- "and %[dest3_u], %[src1], %[c0] \n\t"
- "and %[src1], %[src1], %[c1] \n\t"
- "psrlh %[dest3_v], %[src1], %[four] \n\t"
- "and %[src0], %[src0], %[c0] \n\t"
- "paddh %[b0], %[b0], %[dest3_u] \n\t"
- "paddh %[g0], %[g0], %[dest3_v] \n\t"
- "paddh %[r0], %[r0], %[src0] \n\t"
- "punpcklhw %[src0], %[b0], %[r0] \n\t"
- "punpckhhw %[src1], %[b0], %[r0] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
- "psrlh %[b0], %[src0], %[four] \n\t"
- "psllh %[r0], %[src0], %[two] \n\t"
- "or %[b0], %[b0], %[r0] \n\t"
- "psrlh %[r0], %[g0], %[four] \n\t"
- "psllh %[g0], %[g0], %[two] \n\t"
- "or %[g0], %[g0], %[r0] \n\t"
- "punpcklhw %[src0], %[g0], %[value] \n\t"
- "punpckhhw %[src1], %[g0], %[value] \n\t"
- "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
- "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
- "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
- "punpcklhw %[src0], %[b0], %[g0] \n\t"
- "punpckhhw %[src1], %[b0], %[g0] \n\t"
-
- "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
- "pshufh %[dest3_u], %[src0], %[mask] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
- "pshufh %[b0], %[src1], %[mask] \n\t"
- "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t"
- "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddiu %[width], %[width], -0x10 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
- [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
- [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
- [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
- [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
- [dest3_v] "=&f"(ftmp[12])
- : [src_argb4444] "r"(src_argb4444),
- [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u),
- [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
- [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u),
- [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04),
- [two] "f"(0x02)
- : "memory");
-}
-
-void ARGBToUV444Row_MMI(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t ftmp[12];
- const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
- "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
- "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
-
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
- "psubw %[dest0_u], %[src0], %[src1] \n\t"
- "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
- "psubw %[dest0_v], %[src1], %[src0] \n\t"
- "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
- "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
- "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
- "psubw %[dest1_u], %[src0], %[src1] \n\t"
- "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
- "psubw %[dest1_v], %[src1], %[src0] \n\t"
- "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
- "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
- "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
- "psubw %[dest2_u], %[src0], %[src1] \n\t"
- "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
- "psubw %[dest2_v], %[src1], %[src0] \n\t"
- "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
-
- "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t"
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
- "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
- "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t"
- "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
- "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t"
- "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
- "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
- "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
- "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
- "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
- "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
-
- "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
- "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
- "psubw %[dest3_u], %[src0], %[src1] \n\t"
- "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
- "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
- "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
- "psubw %[dest3_v], %[src1], %[src0] \n\t"
- "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
-
- "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
- "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
- "packushb %[dest0_u], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
- "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
-
- "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
- "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
- "packushb %[dest0_v], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
- "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
-
- "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
- "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
- "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bgtz %[width], 1b \n\t"
- : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
- [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]),
- [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]),
- [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]),
- [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]),
- [dest3_v] "=&f"(ftmp[11])
- : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
- [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10),
- [eight] "f"(0x08)
- : "memory");
-}
-
-void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
- uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
- uint64_t tmp0, tmp1;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x01;
- const uint64_t mask2 = 0x0080004D0096001DULL;
- const uint64_t mask3 = 0xFF000000FF000000ULL;
- const uint64_t mask4 = ~mask3;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "and %[src37], %[src], %[mask3] \n\t"
-
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t"
- "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t"
- "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t"
- "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t"
-
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
- "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t"
- "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t"
- "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t"
- "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t"
- "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "and %[dest], %[dest], %[mask4] \n\t"
- "or %[dest], %[dest], %[src37] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0),
- [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest),
- [src37] "=&f"(src37)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
- [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
- : "memory");
-}
-
-// Convert a row of image to Sepia tone.
-void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) {
- uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2;
- uint64_t tmp0, tmp1;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x002300440011ULL;
- const uint64_t mask2 = 0x002D00580016ULL;
- const uint64_t mask3 = 0x003200620018ULL;
- const uint64_t mask4 = 0xFF000000FF000000ULL;
- const uint64_t shift = 0x07;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "and %[dest37], %[dest], %[mask4] \n\t"
-
- "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t"
- "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t"
- "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t"
- "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t"
- "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
- "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
- "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
-
- "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t"
- "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t"
- "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t"
- "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t"
- "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
- "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
- "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "or %[dest], %[dest], %[dest37] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1),
- [dest] "=&f"(dest)
- : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
- [mask4] "f"(mask4), [shift] "f"(shift)
- : "memory");
-}
-
-// Apply color matrix to a row of image. Matrix is signed.
-// TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2,
- dest3;
- uint64_t matrix, matrix_hi, matrix_lo;
- uint64_t tmp0, tmp1;
- const uint64_t shift0 = 0x06;
- const uint64_t shift1 = 0x08;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
-
- "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest0], %[dest0], %[shift0] \n\t"
-
- "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest1], %[dest1], %[shift0] \n\t"
-
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
-
- "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest2], %[dest2], %[shift0] \n\t"
-
- "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
- "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
- "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
- "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
- "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
- "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
- "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
- "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
- "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
- "psraw %[dest3], %[dest3], %[shift0] \n\t"
-
- "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
- "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
- "packushb %[dest], %[tmp0], %[tmp1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest),
- [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi),
- [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix)
- : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb),
- [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0),
- [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1)
- : "memory");
-}
-
-void ARGBShadeRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[src] \n\t"
- "punpckhbh %[src_hi], %[src], %[src] \n\t"
-
- "punpcklbh %[value], %[value], %[value] \n\t"
-
- "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
- "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src),
- [dest] "=&f"(dest)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
- [value] "f"(value), [shift] "f"(shift)
- : "memory");
-}
-
-void ARGBMultiplyRow_MMI(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo;
- uint64_t dest, dest_lo, dest_hi;
- const uint64_t mask = 0x0;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[src0_lo], %[src0], %[src0] \n\t"
- "punpckhbh %[src0_hi], %[src0], %[src0] \n\t"
-
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src1_lo], %[src1], %[mask] \n\t"
- "punpckhbh %[src1_hi], %[src1], %[mask] \n\t"
-
- "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t"
- "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
- [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
- [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
- : "memory");
-}
-
-void ARGBAddRow_MMI(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "paddusb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [width] "r"(width)
- : "memory");
-}
-
-void ARGBSubtractRow_MMI(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "psubusb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [width] "r"(width)
- : "memory");
-}
-
-// Sobel functions which mimics SSSE3.
-void SobelXRow_MMI(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- uint64_t y00 = 0, y10 = 0, y20 = 0;
- uint64_t y02 = 0, y12 = 0, y22 = 0;
- uint64_t zero = 0x0;
- uint64_t sobel = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
- "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2]
- "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i]
- "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2]
- "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
-
- "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i]
- "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t"
- "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2]
- "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y20], %[y20], %[zero] \n\t"
-
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
- "punpcklbh %[y22], %[y22], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y10] \n\t" // a+b
- "paddh %[y20], %[y20], %[y10] \n\t" // c+b
- "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c
-
- "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub
- "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub
- "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub
-
- "pmaxsh %[y10], %[y00], %[y02] \n\t"
- "pminsh %[y20], %[y00], %[y02] \n\t"
- "psubh %[sobel], %[y10], %[y20] \n\t" // Abs
-
- "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
- "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
- "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
- "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
- "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
-
- "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t"
- "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t"
- "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t"
- "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y20], %[y20], %[zero] \n\t"
-
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
- "punpcklbh %[y22], %[y22], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y10] \n\t"
- "paddh %[y20], %[y20], %[y10] \n\t"
- "paddh %[y00], %[y00], %[y20] \n\t"
-
- "paddh %[y02], %[y02], %[y12] \n\t"
- "paddh %[y22], %[y22], %[y12] \n\t"
- "paddh %[y02], %[y02], %[y22] \n\t"
-
- "pmaxsh %[y10], %[y00], %[y02] \n\t"
- "pminsh %[y20], %[y00], %[y02] \n\t"
- "psubh %[y00], %[y10], %[y20] \n\t"
-
- "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
- "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t"
- "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t"
-
- "daddiu %[src_y0], %[src_y0], 8 \n\t"
- "daddiu %[src_y1], %[src_y1], 8 \n\t"
- "daddiu %[src_y2], %[src_y2], 8 \n\t"
- "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10),
- [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22)
- : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2),
- [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero)
- : "memory");
-}
-
-void SobelYRow_MMI(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- uint64_t y00 = 0, y01 = 0, y02 = 0;
- uint64_t y10 = 0, y11 = 0, y12 = 0;
- uint64_t zero = 0x0;
- uint64_t sobel = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
- "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
- "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1]
- "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2]
- "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i]
- "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
- "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1]
- "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2]
- "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y01], %[y01], %[zero] \n\t"
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
-
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y11], %[y11], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y01] \n\t" // a+b
- "paddh %[y02], %[y02], %[y01] \n\t" // c+b
- "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c
-
- "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub
- "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub
- "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub
-
- "pmaxsh %[y02], %[y00], %[y10] \n\t"
- "pminsh %[y12], %[y00], %[y10] \n\t"
- "psubh %[sobel], %[y02], %[y12] \n\t" // Abs
-
- "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
- "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
- "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t"
- "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t"
- "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
- "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
-
- "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
- "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
- "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t"
- "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t"
- "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
- "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
-
- "punpcklbh %[y00], %[y00], %[zero] \n\t"
- "punpcklbh %[y01], %[y01], %[zero] \n\t"
- "punpcklbh %[y02], %[y02], %[zero] \n\t"
-
- "punpcklbh %[y10], %[y10], %[zero] \n\t"
- "punpcklbh %[y11], %[y11], %[zero] \n\t"
- "punpcklbh %[y12], %[y12], %[zero] \n\t"
-
- "paddh %[y00], %[y00], %[y01] \n\t"
- "paddh %[y02], %[y02], %[y01] \n\t"
- "paddh %[y00], %[y00], %[y02] \n\t"
-
- "paddh %[y10], %[y10], %[y11] \n\t"
- "paddh %[y12], %[y12], %[y11] \n\t"
- "paddh %[y10], %[y10], %[y12] \n\t"
-
- "pmaxsh %[y02], %[y00], %[y10] \n\t"
- "pminsh %[y12], %[y00], %[y10] \n\t"
- "psubh %[y00], %[y02], %[y12] \n\t"
-
- "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
- "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t"
- "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t"
-
- "daddiu %[src_y0], %[src_y0], 8 \n\t"
- "daddiu %[src_y1], %[src_y1], 8 \n\t"
- "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01),
- [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12)
- : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1),
- [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero)
- : "memory");
-}
-
-void SobelRow_MMI(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- double temp[3];
- uint64_t c1 = 0xff000000ff000000;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i]
- "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t"
- "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
- "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t"
- // s7 s6 s5 s4 s3 s2 s1 s0 = a+b
- "paddusb %[t2] , %[t0], %[t1] \n\t"
-
- // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0
- "punpcklbh %[t0], %[t2], %[t2] \n\t"
-
- // s1 s1 s0 s0->s1 s2 s1 s1 s0 s0 s0 s0
- "punpcklbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- // 255 s1 s1 s1 s55 s0 s0 s0
- "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t"
-
- // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
- "punpckhbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- // 255 s3 s3 s3 255 s2 s2 s2
- "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t"
-
- // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
- "punpckhbh %[t0], %[t2], %[t2] \n\t"
-
- // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
- "punpcklbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t"
-
- // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
- "punpckhbh %[t1], %[t0], %[t0] \n\t"
- "or %[t1], %[t1], %[c1] \n\t"
- "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t"
- "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t"
-
- "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
- "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
- "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
- : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
- [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
- : "memory");
-}
-
-void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- uint64_t tr = 0;
- uint64_t tb = 0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t"
- "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t"
- "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i]
- "paddusb %[tr], %[tr], %[tb] \n\t" // g
- "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t"
- "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t"
-
- "daddiu %[dst_y], %[dst_y], 8 \n\t"
- "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
- "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [tr] "=&f"(tr), [tb] "=&f"(tb)
- : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
- [dst_y] "r"(dst_y), [width] "r"(width)
- : "memory");
-}
-
-void SobelXYRow_MMI(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- uint64_t temp[3];
- uint64_t result = 0;
- uint64_t gb = 0;
- uint64_t cr = 0;
- uint64_t c1 = 0xffffffffffffffff;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t"
- "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
- "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t"
- "paddusb %[tg] , %[tr], %[tb] \n\t" // g
-
- // g3 b3 g2 b2 g1 b1 g0 b0
- "punpcklbh %[gb], %[tb], %[tg] \n\t"
- // c3 r3 r2 r2 c1 r1 c0 r0
- "punpcklbh %[cr], %[tr], %[c1] \n\t"
- // c1 r1 g1 b1 c0 r0 g0 b0
- "punpcklhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t"
- // c3 r3 g3 b3 c2 r2 g2 b2
- "punpckhhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t"
-
- // g7 b7 g6 b6 g5 b5 g4 b4
- "punpckhbh %[gb], %[tb], %[tg] \n\t"
- // c7 r7 c6 r6 c5 r5 c4 r4
- "punpckhbh %[cr], %[tr], %[c1] \n\t"
- // c5 r5 g5 b5 c4 r4 g4 b4
- "punpcklhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t"
- // c7 r7 g7 b7 c6 r6 g6 b6
- "punpckhhw %[result], %[gb], %[cr] \n\t"
- "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t"
- "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t"
-
- "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
- "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
- "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]),
- [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result)
- : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
- [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
- : "memory");
-}
-
-void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- // Copy a Y to RGB.
- uint64_t src, dest;
- const uint64_t mask0 = 0x00ffffff00ffffffULL;
- const uint64_t mask1 = ~mask0;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src], %[src], %[src] \n\t"
- "punpcklhw %[dest], %[src], %[src] \n\t"
- "and %[dest], %[dest], %[mask0] \n\t"
- "or %[dest], %[dest], %[mask1] \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
-
- "punpckhhw %[dest], %[src], %[src] \n\t"
- "and %[dest], %[dest], %[mask0] \n\t"
- "or %[dest], %[dest], %[mask1] \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-// TODO - respect YuvConstants
-void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf,
- const struct YuvConstants*, int width) {
- uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x55;
- const uint64_t mask2 = 0xAA;
- const uint64_t mask3 = 0xFF;
- const uint64_t mask4 = 0x4A354A354A354A35ULL;
- const uint64_t mask5 = 0x0488048804880488ULL;
- const uint64_t shift0 = 0x08;
- const uint64_t shift1 = 0x06;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
-
- "pshufh %[src], %[src_lo], %[mask0] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_lo], %[mask1] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "pshufh %[src], %[src_lo], %[mask2] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_lo], %[mask3] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "pshufh %[src], %[src_hi], %[mask0] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_hi], %[mask1] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
-
- "pshufh %[src], %[src_hi], %[mask2] \n\t"
- "psllh %[dest_lo], %[src], %[shift0] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
- "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
- "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
- "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
- "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
- "pshufh %[src], %[src_hi], %[mask3] \n\t"
- "psllh %[dest_hi], %[src], %[shift0] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
- "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
- "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
- "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
- "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
-
- "daddi %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo)
- : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
- [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0),
- [shift1] "f"(shift1), [width] "r"(width)
- : "memory");
-}
-
-void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- uint64_t source, src0, src1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x1b;
-
- src += width - 1;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[source], 0(%[src_ptr]) \n\t"
- "gsldrc1 %[source], -7(%[src_ptr]) \n\t"
- "punpcklbh %[src0], %[source], %[mask0] \n\t"
- "pshufh %[src0], %[src0], %[mask1] \n\t"
- "punpckhbh %[src1], %[source], %[mask0] \n\t"
- "pshufh %[src1], %[src1], %[mask1] \n\t"
- "packushb %[dest], %[src1], %[src0] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddi %[src_ptr], %[src_ptr], -0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0),
- [src1] "=&f"(src1)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-void MirrorSplitUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t src0, src1, dest0, dest1;
- const uint64_t mask0 = 0x00ff00ff00ff00ffULL;
- const uint64_t mask1 = 0x1b;
- const uint64_t shift = 0x08;
-
- src_uv += (width - 1) << 1;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 1(%[src_ptr]) \n\t"
- "gsldrc1 %[src0], -6(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], -7(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], -14(%[src_ptr]) \n\t"
-
- "and %[dest0], %[src0], %[mask0] \n\t"
- "pshufh %[dest0], %[dest0], %[mask1] \n\t"
- "and %[dest1], %[src1], %[mask0] \n\t"
- "pshufh %[dest1], %[dest1], %[mask1] \n\t"
- "packushb %[dest0], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t"
-
- "psrlh %[dest0], %[src0], %[shift] \n\t"
- "pshufh %[dest0], %[dest0], %[mask1] \n\t"
- "psrlh %[dest1], %[src1], %[shift] \n\t"
- "pshufh %[dest1], %[dest1], %[mask1] \n\t"
- "packushb %[dest0], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t"
-
- "daddi %[src_ptr], %[src_ptr], -0x10 \n\t"
- "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t"
- "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
- [src1] "=&f"(src1)
- : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v),
- [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [shift] "f"(shift)
- : "memory");
-}
-
-void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- src += (width - 1) * 4;
- uint64_t temp = 0x0;
- uint64_t shuff = 0x4e; // 01 00 11 10
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[temp], 3(%[src]) \n\t"
- "gsldrc1 %[temp], -4(%[src]) \n\t"
- "pshufh %[temp], %[temp], %[shuff] \n\t"
- "gssdrc1 %[temp], 0x0(%[dst]) \n\t"
- "gssdlc1 %[temp], 0x7(%[dst]) \n\t"
-
- "daddiu %[src], %[src], -0x08 \n\t"
- "daddiu %[dst], %[dst], 0x08 \n\t"
- "daddiu %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [temp] "=&f"(temp)
- : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff)
- : "memory");
-}
-
-void SplitUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[4];
- uint64_t shift = 0x08;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t"
-
- "and %[t2], %[t0], %[c0] \n\t"
- "and %[t3], %[t1], %[c0] \n\t"
- "packushb %[t2], %[t2], %[t3] \n\t"
- "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t"
-
- "psrlh %[t2], %[t0], %[shift] \n\t"
- "psrlh %[t3], %[t1], %[shift] \n\t"
- "packushb %[t2], %[t2], %[t3] \n\t"
- "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t"
-
- "daddiu %[src_uv], %[src_uv], 16 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
- [t3] "=&f"(temp[3])
- : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-void MergeUVRow_MMI(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- uint64_t temp[3];
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x0(%[src_u]) \n\t"
- "gsldlc1 %[t0], 0x7(%[src_u]) \n\t"
- "gsldrc1 %[t1], 0x0(%[src_v]) \n\t"
- "gsldlc1 %[t1], 0x7(%[src_v]) \n\t"
- "punpcklbh %[t2], %[t0], %[t1] \n\t"
- "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t"
- "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t"
- "punpckhbh %[t2], %[t0], %[t1] \n\t"
- "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t"
- "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t"
-
- "daddiu %[src_u], %[src_u], 8 \n\t"
- "daddiu %[src_v], %[src_v], 8 \n\t"
- "daddiu %[dst_uv], %[dst_uv], 16 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
- : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [width] "r"(width)
- : "memory");
-}
-
-void SplitRGBRow_MMI(const uint8_t* src_rgb,
- uint8_t* dst_r,
- uint8_t* dst_g,
- uint8_t* dst_b,
- int width) {
- uint64_t src[4];
- uint64_t dest_hi, dest_lo, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
- "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
- "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
- "punpcklbh %[dest_lo], %[src0], %[src1] \n\t"
- "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t"
- "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t"
- "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t"
- "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t"
- "punpcklbh %[dest_hi], %[src2], %[src3] \n\t"
-
- "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t"
- "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
- "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t"
- "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t"
- "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]),
- [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g),
- [dstb_ptr] "r"(dst_b), [width] "r"(width)
- : "memory");
-}
-
-void MergeRGBRow_MMI(const uint8_t* src_r,
- const uint8_t* src_g,
- const uint8_t* src_b,
- uint8_t* dst_rgb,
- int width) {
- uint64_t srcr, srcg, srcb, dest;
- uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo;
- const uint64_t temp = 0x0;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t"
- "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t"
- "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t"
- "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t"
- "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t"
- "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t"
-
- "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t"
- "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t"
- "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t"
- "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t"
-
- "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
- "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t"
- "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
- "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t"
- "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
- "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
- "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t"
- "punpckhwd %[dest], %[dest], %[dest] \n\t"
- "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t"
- "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t"
-
- "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t"
- "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t"
- "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb),
- [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi),
- [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi),
- [srcbz_lo] "=&f"(srcbz_lo)
- : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b),
- [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp)
- : "memory");
-}
-
-// Filter 2 rows of YUY2 UV's (422) into U and V (420).
-void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
- int src_stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t c0 = 0xff00ff00ff00ff00;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t temp[3];
- uint64_t data[4];
- uint64_t shift = 0x08;
- uint64_t src_stride = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
- "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t"
- "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t"
- "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c1] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t"
- "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c1] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
- [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
- [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
- : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
- : "memory");
-}
-
-// Copy row of YUY2 UV's (422) into U and V (422).
-void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- uint64_t c0 = 0xff00ff00ff00ff00;
- uint64_t c1 = 0x00ff00ff00ff00ff;
- uint64_t temp[2];
- uint64_t data[4];
- uint64_t shift = 0x08;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c1] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "psrlh %[t0], %[t0], %[shift] \n\t"
- "psrlh %[t1], %[t1], %[shift] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c1] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
- [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
- : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
- : "memory");
-}
-
-// Copy row of YUY2 Y's (422) into Y (420/422).
-void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[2];
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
- "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
- "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t"
- "daddiu %[dst_y], %[dst_y], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
- : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width),
- [c0] "f"(c0)
- : "memory");
-}
-
-// Filter 2 rows of UYVY UV's (422) into U and V (420).
-void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
- int src_stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- // Output a row of UV values.
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[3];
- uint64_t data[4];
- uint64_t shift = 0x08;
- uint64_t src_stride = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
- "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t"
- "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t"
- "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c0] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
- "pavgb %[t0], %[t0], %[t1] \n\t"
-
- "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t"
- "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
- "pavgb %[t1], %[t2], %[t1] \n\t"
-
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c0] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
- [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
- [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
- : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy),
- [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
- [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-// Copy row of UYVY UV's (422) into U and V (422).
-void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- // Output a row of UV values.
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t temp[2];
- uint64_t data[4];
- uint64_t shift = 0x08;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d0], %[t0], %[c0] \n\t"
- "psrlh %[d1], %[t1], %[shift] \n\t"
-
- "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t"
- "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "mov.s %[t1], %[t0] \n\t"
- "and %[d2], %[t0], %[c0] \n\t"
- "psrlh %[d3], %[t1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d2] \n\t"
- "packushb %[d1], %[d1], %[d3] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
- "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
- "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
- "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
- "daddiu %[dst_u], %[dst_u], 8 \n\t"
- "daddiu %[dst_v], %[dst_v], 8 \n\t"
- "daddiu %[width], %[width], -16 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
- [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
- : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
- [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-// Copy row of UYVY Y's (422) into Y (420/422).
-void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
- // Output a row of Y values.
- uint64_t c0 = 0x00ff00ff00ff00ff;
- uint64_t shift = 0x08;
- uint64_t temp[2];
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
- "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
- "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
- "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
- "dsrl %[t0], %[t0], %[shift] \n\t"
- "dsrl %[t1], %[t1], %[shift] \n\t"
- "and %[t0], %[t0], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "and %[t1], %[t1], %[c0] \n\t"
- "packushb %[t0], %[t0], %[t1] \n\t"
- "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
- "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
- "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t"
- "daddiu %[dst_y], %[dst_y], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
- : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width),
- [c0] "f"(c0), [shift] "f"(shift)
- : "memory");
-}
-
-// Blend src_argb over src_argb1 and store to dst_argb.
-// dst_argb may be src_argb or src_argb1.
-// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_MMI(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi,
- dest_lo;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL;
- const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
- const uint64_t mask3 = 0xFF;
- const uint64_t mask4 = ~mask1;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
-
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
-
- "psubush %[alpha], %[mask2], %[src0_lo] \n\t"
- "pshufh %[alpha], %[alpha], %[mask3] \n\t"
- "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t"
-
- "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
- "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
-
- "psubush %[alpha], %[mask2], %[src0_hi] \n\t"
- "pshufh %[alpha], %[alpha], %[mask3] \n\t"
- "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[mask4] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha),
- [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
- [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo)
- : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
- [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4),
- [shift] "f"(shift), [width] "r"(width)
- : "memory");
-}
-
-void BlendPlaneRow_MMI(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- uint64_t source0, source1, dest, alph;
- uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi,
- dest_lo;
- uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL;
- const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
- "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
-
- "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
- "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
- "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
-
- "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t"
- "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t"
- "psubusb %[alpha_r], %[mask1], %[alpha] \n\t"
- "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t"
- "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t"
- "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t"
- "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t"
-
- "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t"
- "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[dest] \n\t"
- "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
-
- "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t"
- "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[dest] \n\t"
- "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph),
- [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
- [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
- [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi),
- [alpha_r] "=&f"(alpha_rev)
- : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha),
- [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width)
- : "memory");
-}
-
-// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
-void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha;
- const uint64_t mask0 = 0xFF;
- const uint64_t mask1 = 0xFF000000FF000000ULL;
- const uint64_t mask2 = ~mask1;
- const uint64_t shift = 0x08;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[src] \n\t"
- "punpckhbh %[src_hi], %[src], %[src] \n\t"
-
- "pshufh %[alpha], %[src_lo], %[mask0] \n\t"
- "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
- "pshufh %[alpha], %[src_hi], %[mask0] \n\t"
- "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "and %[dest], %[dest], %[mask2] \n\t"
- "and %[src], %[src], %[mask1] \n\t"
- "or %[dest], %[dest], %[src] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift),
- [width] "r"(width)
- : "memory");
-}
-
-void ComputeCumulativeSumRow_MMI(const uint8_t* row,
- int32_t* cumsum,
- const int32_t* previous_cumsum,
- int width) {
- int64_t row_sum[2] = {0, 0};
- uint64_t src, dest0, dest1, presrc0, presrc1, dest;
- const uint64_t mask = 0x0;
-
- __asm__ volatile(
- "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t"
- "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t"
- "gslwrc1 %[src], 0x00(%[row_ptr]) \n\t"
-
- "punpcklbh %[src], %[src], %[mask] \n\t"
- "punpcklhw %[dest0], %[src], %[mask] \n\t"
- "punpckhhw %[dest1], %[src], %[mask] \n\t"
-
- "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t"
- "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t"
-
- "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t"
- "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t"
- "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t"
- "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t"
-
- "paddw %[dest0], %[row_sum0], %[presrc0] \n\t"
- "paddw %[dest1], %[row_sum1], %[presrc1] \n\t"
-
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t"
- "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x01 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]),
- [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0),
- [presrc1] "=&f"(presrc1)
- : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum),
- [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask)
- : "memory");
-}
-
-// C version 2x2 -> 2x1.
-void InterpolateRow_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int width,
- int source_y_fraction) {
- if (source_y_fraction == 0) {
- __asm__ volatile(
- "1: \n\t"
- "ld $t0, 0x0(%[src_ptr]) \n\t"
- "sd $t0, 0x0(%[dst_ptr]) \n\t"
- "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- :
- : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width)
- : "memory");
- return;
- }
- if (source_y_fraction == 128) {
- uint64_t uv = 0x0;
- uint64_t uv_stride = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t"
- "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t"
- "daddu $t0, %[src_ptr], %[stride] \n\t"
- "gsldrc1 %[uv_stride], 0x0($t0) \n\t"
- "gsldlc1 %[uv_stride], 0x7($t0) \n\t"
-
- "pavgb %[uv], %[uv], %[uv_stride] \n\t"
- "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t"
- "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
- [stride] "r"((int64_t)src_stride)
- : "memory");
- return;
- }
- const uint8_t* src_ptr1 = src_ptr + src_stride;
- uint64_t temp;
- uint64_t data[4];
- uint64_t zero = 0x0;
- uint64_t c0 = 0x0080008000800080;
- uint64_t fy0 = 0x0100010001000100;
- uint64_t shift = 0x8;
- __asm__ volatile(
- "pshufh %[fy1], %[fy1], %[zero] \n\t"
- "psubh %[fy0], %[fy0], %[fy1] \n\t"
- "1: \n\t"
- "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t"
- "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t"
- "punpcklbh %[d0], %[t0], %[zero] \n\t"
- "punpckhbh %[d1], %[t0], %[zero] \n\t"
- "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t"
- "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t"
- "punpcklbh %[d2], %[t0], %[zero] \n\t"
- "punpckhbh %[d3], %[t0], %[zero] \n\t"
-
- "pmullh %[d0], %[d0], %[fy0] \n\t"
- "pmullh %[d2], %[d2], %[fy1] \n\t"
- "paddh %[d0], %[d0], %[d2] \n\t"
- "paddh %[d0], %[d0], %[c0] \n\t"
- "psrlh %[d0], %[d0], %[shift] \n\t"
-
- "pmullh %[d1], %[d1], %[fy0] \n\t"
- "pmullh %[d3], %[d3], %[fy1] \n\t"
- "paddh %[d1], %[d1], %[d3] \n\t"
- "paddh %[d1], %[d1], %[c0] \n\t"
- "psrlh %[d1], %[d1], %[shift] \n\t"
-
- "packushb %[d0], %[d0], %[d1] \n\t"
- "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t"
- "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t"
- "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
- "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]),
- [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
- : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1),
- [dst_ptr] "r"(dst_ptr), [width] "r"(width),
- [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0),
- [shift] "f"(shift), [zero] "f"(zero)
- : "memory");
-}
-
-// Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- uint64_t source, dest0, dest1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) |
- ((shuffler[2] & 0x03) << 4) |
- ((shuffler[3] & 0x03) << 6);
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[dest0], %[src], %[mask0] \n\t"
- "pshufh %[dest0], %[dest0], %[mask1] \n\t"
- "punpckhbh %[dest1], %[src], %[mask0] \n\t"
- "pshufh %[dest1], %[dest1], %[mask1] \n\t"
- "packushb %[dest], %[dest0], %[dest1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-void I422ToYUY2Row_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_frame,
- int width) {
- uint64_t temp[3];
- uint64_t vu = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
- "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
- "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
- "punpcklbh %[tu], %[ty], %[vu] \n\t" // g
- "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
- "punpckhbh %[tu], %[ty], %[vu] \n\t" // g
- "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
- "daddiu %[src_y], %[src_y], 8 \n\t"
- "daddiu %[src_u], %[src_u], 4 \n\t"
- "daddiu %[src_v], %[src_v], 4 \n\t"
- "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
- [vu] "=&f"(vu)
- : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [dst_frame] "r"(dst_frame), [width] "r"(width)
- : "memory");
-}
-
-void I422ToUYVYRow_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_frame,
- int width) {
- uint64_t temp[3];
- uint64_t vu = 0x0;
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i]
- "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i]
- "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i]
- "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i]
- "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i]
- "punpcklbh %[vu], %[tu], %[tv] \n\t" // g
- "punpcklbh %[tu], %[vu], %[ty] \n\t" // g
- "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
- "punpckhbh %[tu], %[vu], %[ty] \n\t" // g
- "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
- "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
- "daddiu %[src_y], %[src_y], 8 \n\t"
- "daddiu %[src_u], %[src_u], 4 \n\t"
- "daddiu %[src_v], %[src_v], 4 \n\t"
- "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
- "daddiu %[width], %[width], -8 \n\t"
- "bgtz %[width], 1b \n\t"
- "nop \n\t"
- : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]),
- [vu] "=&f"(vu)
- : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [dst_frame] "r"(dst_frame), [width] "r"(width)
- : "memory");
-}
-
-void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- uint64_t source, dest;
- const uint64_t mask0 = 0xff000000ff000000ULL;
- const uint64_t mask1 = ~mask0;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "and %[src], %[src], %[mask0] \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[src], %[dest] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(source), [dest] "=&f"(dest)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- uint64_t src, dest0, dest1, dest_lo, dest_hi, dest;
- const uint64_t mask = 0xff000000ff000000ULL;
- const uint64_t shift = 0x18;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "and %[dest0], %[src], %[mask] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
- "and %[dest1], %[src], %[mask] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
-
- "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
- "and %[dest0], %[src], %[mask] \n\t"
- "psrlw %[dest0], %[dest0], %[shift] \n\t"
- "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
- "and %[dest1], %[src], %[mask] \n\t"
- "psrlw %[dest1], %[dest1], %[shift] \n\t"
- "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask),
- [shift] "f"(shift), [width] "r"(width)
- : "memory");
-}
-
-void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
- uint64_t source, dest0, dest1, dest;
- const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x00ffffff00ffffffULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[dest0], %[mask0], %[src] \n\t"
- "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
- "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "punpckhbh %[dest0], %[mask0], %[src] \n\t"
- "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
- "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
- "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
- "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
- "and %[dest], %[dest], %[mask1] \n\t"
- "or %[dest], %[dest], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
- [mask1] "f"(mask1), [width] "r"(width)
- : "memory");
-}
-
-void I444ToARGBRow_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y,u,v;
- uint64_t b_vec[2],g_vec[2],r_vec[2];
- uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
- __asm__ volatile (
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
- "or %[ub], %[ub], %[mask] \n\t"//must sign extension
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"//sign extension
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t"//y1
-
- "punpcklbh %[u], %[u], %[zero] \n\t"//u
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- "punpcklbh %[v], %[v], %[zero] \n\t"//v
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [y]"=&f"(y),
- [u]"=&f"(u), [v]"=&f"(v),
- [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
- [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
- [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
- [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [alpha]"f"(-1),
- [six]"f"(0x6), [five]"f"(0x55),
- [mask]"f"(mask)
- : "memory"
- );
-}
-
-// Also used for 420
-void I422ToARGBRow_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y,u,v;
- uint64_t b_vec[2],g_vec[2],r_vec[2];
- uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub
- "or %[ub], %[ub], %[mask] \n\t"//must sign extension
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"//sign extension
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t"//y1
-
- //u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"//u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- //v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"//v
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y),
- [u]"=&f"(u), [v]"=&f"(v),
- [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
- [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
- [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
- [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [alpha]"f"(-1),
- [six]"f"(0x6), [five]"f"(0x55),
- [mask]"f"(mask)
- : "memory"
- );
-}
-
-// 10 bit YUV to ARGB
-void I210ToARGBRow_MMI(const uint16_t* src_y,
- const uint16_t* src_u,
- const uint16_t* src_v,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y,u,v;
- uint64_t b_vec[2],g_vec[2],r_vec[2];
- uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "psllh %[y], %[y], %[six] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "punpcklhw %[u], %[u], %[u] \n\t"
- "psrah %[u], %[u], %[two] \n\t"
- "punpcklhw %[v], %[v], %[v] \n\t"
- "psrah %[v], %[v], %[two] \n\t"
- "pminsh %[u], %[u], %[mask1] \n\t"
- "pminsh %[v], %[v], %[mask1] \n\t"
-
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
-
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
-
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
- "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y),
- [u]"=&f"(u), [v]"=&f"(v),
- [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
- [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
- [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
- [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [alpha]"f"(-1),
- [six]"f"(0x6), [five]"f"(0x55),
- [mask]"f"(mask), [two]"f"(0x02),
- [mask1]"f"(0x00ff00ff00ff00ff)
- : "memory"
- );
-}
-
-void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- const uint8_t* src_a,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y,u,v,a;
- uint64_t b_vec[2],g_vec[2],r_vec[2];
- uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
- "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t"
- "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t"//y1
-
- //u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"//u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- //v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
- "packushb %[g_vec0], %[g_vec0], %[a] \n\t"
- "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v), [a]"=&f"(a),
- [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
- [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
- [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
- [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [a_ptr]"r"(src_a), [zero]"f"(0x00),
- [six]"f"(0x6), [five]"f"(0x55),
- [mask]"f"(mask)
- : "memory"
- );
-}
-
-void I422ToRGB24Row_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y,u,v;
- uint64_t b_vec[2],g_vec[2],r_vec[2];
- uint64_t mask = 0xff00ff00ff00ff00ULL;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t"//y1
-
- //u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"//u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec0], %[y], %[bb] \n\t"
- "pmullh %[b_vec1], %[u], %[ub] \n\t"
- "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"
- "psrah %[b_vec0], %[b_vec0], %[six] \n\t"
-
- //v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec0], %[y], %[bg] \n\t"
- "pmullh %[g_vec1], %[u], %[ug] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "pmullh %[g_vec1], %[v], %[vg] \n\t"
- "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"
- "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
-
- "paddsh %[r_vec0], %[y], %[br] \n\t"
- "pmullh %[r_vec1], %[v], %[vr] \n\t"
- "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"
- "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
-
- "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"
- "packushb %[g_vec0], %[g_vec0], %[zero] \n\t"
- "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"
- "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"
- "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"
-
- "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t"
- "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t"
- "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t"
- "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t"
- "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t"
- "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"
- "pextrh %[r_vec1], %[g_vec1], %[one] \n\t"
- "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t"
- "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t"
- "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t"
- "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
-
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
- [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
- [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
- [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask]"f"(mask),
- [lmove1]"f"(0x18), [rmove1]"f"(0x8),
- [one]"f"(0x1)
- : "memory"
- );
-}
-
-void I422ToARGB4444Row_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb4444,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101
- "pmulhuh %[y], %[y], %[yg] \n\t"//y1
-
- //u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"//u
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- //v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "and %[g_vec], %[g_vec], %[mask1] \n\t"
- "psrlw %[g_vec], %[g_vec], %[four] \n\t"
- "psrlw %[r_vec], %[g_vec], %[four] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
- "and %[g_vec], %[g_vec], %[r_vec] \n\t"
-
- "and %[b_vec], %[b_vec], %[mask1] \n\t"
- "psrlw %[b_vec], %[b_vec], %[four] \n\t"
- "psrlw %[r_vec], %[b_vec], %[four] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
- "and %[b_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
- [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00),
- [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0),
- [alpha]"f"(-1)
- : "memory"
- );
-}
-
-void I422ToARGB1555Row_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_argb1555,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- //u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- //v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "psrlw %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "or %[g_vec], %[g_vec], %[mask3] \n\t"
-
- "psrlw %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "psrlw %[temp], %[temp], %[eight] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "or %[b_vec], %[b_vec], %[mask3] \n\t"
-
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
- [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
- [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
- [eight]"f"(0x8), [mask3]"f"(0x800000008000),
- [lmove5]"f"(0x5)
- : "memory"
- );
-}
-
-void I422ToRGB565Row_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- //u3|u2|u1|u0 --> u1|u1|u0|u0
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- //v3|v2|v1|v0 --> v1|v1|v0|v0
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
- [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
- [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
- [eight]"f"(0x8), [seven]"f"(0x7),
- [lmove5]"f"(0x5)
- : "memory"
- );
-}
-
-void NV12ToARGBRow_MMI(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
- [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
- [ushu]"f"(0xA0), [vshu]"f"(0xf5),
- [alpha]"f"(-1)
- : "memory"
- );
-}
-
-void NV21ToARGBRow_MMI(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[ushu] \n\t"
- "pshufh %[u], %[u], %[vshu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
- [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
- [ushu]"f"(0xA0), [vshu]"f"(0xf5),
- [alpha]"f"(-1)
- : "memory"
- );
-}
-
-void NV12ToRGB24Row_MMI(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
- "psllw %[temp], %[r_vec], %[lmove1] \n\t"
- "or %[g_vec], %[g_vec], %[temp] \n\t"
- "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
- "pextrh %[temp], %[temp], %[zero] \n\t"
- "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[zero] \n\t"
- "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[one] \n\t"
- "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
- "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
- "or %[b_vec], %[b_vec], %[temp] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
- [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
- [ushu]"f"(0xA0), [vshu]"f"(0xf5),
- [alpha]"f"(-1), [lmove1]"f"(0x18),
- [one]"f"(0x1), [rmove1]"f"(0x8)
- : "memory"
- );
-}
-
-void NV21ToRGB24Row_MMI(const uint8_t* src_y,
- const uint8_t* src_vu,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[ushu] \n\t"
- "pshufh %[u], %[u], %[vshu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"
- "psllw %[temp], %[r_vec], %[lmove1] \n\t"
- "or %[g_vec], %[g_vec], %[temp] \n\t"
- "psrlw %[temp], %[r_vec], %[rmove1] \n\t"
- "pextrh %[temp], %[temp], %[zero] \n\t"
- "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[zero] \n\t"
- "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
- "pextrh %[temp], %[b_vec], %[one] \n\t"
- "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
- "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"
- "or %[b_vec], %[b_vec], %[temp] \n\t"
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"
- "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),
- [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
- [ushu]"f"(0xA0), [vshu]"f"(0xf5),
- [lmove1]"f"(0x18), [rmove1]"f"(0x8),
- [one]"f"(0x1)
- : "memory"
- );
-}
-
-void NV12ToRGB565Row_MMI(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "pshufh %[v], %[u], %[vshu] \n\t"
- "pshufh %[u], %[u], %[ushu] \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[g_vec], %[three] \n\t"
- "and %[g_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psubb %[y], %[eight], %[three] \n\t"//5
- "psllw %[r_vec], %[r_vec], %[y] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[g_vec], %[g_vec], %[r_vec] \n\t"
-
- "psrlh %[temp], %[b_vec], %[three] \n\t"
- "and %[b_vec], %[temp], %[mask2] \n\t"
- "psrlw %[temp], %[temp], %[seven] \n\t"
- "psrlw %[r_vec], %[mask1], %[eight] \n\t"
- "and %[r_vec], %[temp], %[r_vec] \n\t"
- "psubb %[y], %[eight], %[three] \n\t"//5
- "psllw %[r_vec], %[r_vec], %[y] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
- "paddb %[r_vec], %[three], %[six] \n\t"
- "psrlw %[temp], %[temp], %[r_vec] \n\t"
- "and %[r_vec], %[temp], %[mask2] \n\t"
- "paddb %[temp], %[three], %[eight] \n\t"
- "psllw %[r_vec], %[r_vec], %[temp] \n\t"
- "or %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"
- "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
- "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
- "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),
- [dst_rgb565]"r"(dst_rgb565),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
- [ushu]"f"(0xA0), [vshu]"f"(0xf5),
- [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
- [eight]"f"(0x8), [seven]"f"(0x7)
- : "memory"
- );
-}
-
-void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t"
- "psrlh %[temp], %[y], %[eight] \n\t"
- "pshufh %[u], %[temp], %[ushu] \n\t"
- "pshufh %[v], %[temp], %[vshu] \n\t"
-
- "psrlh %[temp], %[mask1], %[eight] \n\t"
- "and %[y], %[y], %[temp] \n\t"
- "psllh %[temp], %[y], %[eight] \n\t"
- "or %[y], %[y], %[temp] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
- [ushu]"f"(0xA0), [vshu]"f"(0xf5),
- [alpha]"f"(-1), [eight]"f"(0x8)
- : "memory"
- );
-}
-
-void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t"
- "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t"
- "psrlh %[temp], %[mask1], %[eight] \n\t"
- "and %[temp], %[y], %[temp] \n\t"
- "pshufh %[u], %[temp], %[ushu] \n\t"
- "pshufh %[v], %[temp], %[vshu] \n\t"
-
- "psrlh %[y], %[y], %[eight] \n\t"
- "psllh %[temp], %[y], %[eight] \n\t"
- "or %[y], %[y], %[temp] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"
- "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"
- "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
- [ushu]"f"(0xA0), [vshu]"f"(0xf5),
- [alpha]"f"(-1), [eight]"f"(0x8)
- : "memory"
- );
-}
-
-void I422ToRGBARow_MMI(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint64_t y, u, v;
- uint64_t b_vec, g_vec, r_vec, temp;
- uint64_t ub,ug,vg,vr,bb,bg,br,yg;
-
- __asm__ volatile(
- "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
- "or %[ub], %[ub], %[mask1] \n\t"
- "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
- "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[ug], %[ug], %[zero] \n\t"
- "pshufh %[ug], %[ug], %[zero] \n\t"
- "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vg], %[vg], %[zero] \n\t"
- "pshufh %[vg], %[vg], %[five] \n\t"
- "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
- "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
- "punpcklbh %[vr], %[vr], %[zero] \n\t"
- "pshufh %[vr], %[vr], %[five] \n\t"
- "or %[vr], %[vr], %[mask1] \n\t"
-
- "1: \n\t"
- "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"
- "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
- "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"
- "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
- "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"
- "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
-
- "punpcklbh %[y], %[y], %[y] \n\t"
- "pmulhuh %[y], %[y], %[yg] \n\t"
-
- "punpcklbh %[u], %[u], %[u] \n\t"
- "punpcklbh %[u], %[u], %[zero] \n\t"
- "paddsh %[b_vec], %[y], %[bb] \n\t"
- "pmullh %[temp], %[u], %[ub] \n\t"
- "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
- "psrah %[b_vec], %[b_vec], %[six] \n\t"
-
- "punpcklbh %[v], %[v], %[v] \n\t"
- "punpcklbh %[v], %[v], %[zero] \n\t"
- "paddsh %[g_vec], %[y], %[bg] \n\t"
- "pmullh %[temp], %[u], %[ug] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "pmullh %[temp], %[v], %[vg] \n\t"
- "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
- "psrah %[g_vec], %[g_vec], %[six] \n\t"
-
- "paddsh %[r_vec], %[y], %[br] \n\t"
- "pmullh %[temp], %[v], %[vr] \n\t"
- "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
- "psrah %[r_vec], %[r_vec], %[six] \n\t"
-
- "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"
- "packushb %[g_vec], %[g_vec], %[zero] \n\t"
- "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t"
- "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t"
- "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t"
- "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
- "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
-
- "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
- "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"
- "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
-
- "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
- "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
- "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
- "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
-
- : [y]"=&f"(y), [u]"=&f"(u),
- [v]"=&f"(v),
- [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
- [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
- [ub]"=&f"(ub), [ug]"=&f"(ug),
- [vg]"=&f"(vg), [vr]"=&f"(vr),
- [bb]"=&f"(bb), [bg]"=&f"(bg),
- [br]"=&f"(br), [yg]"=&f"(yg)
- : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),
- [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
- [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
- [zero]"f"(0x00), [five]"f"(0x55),
- [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
- [alpha]"f"(-1)
- : "memory"
- );
-}
-
-void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) {
- __asm__ volatile (
- "punpcklwd %[v32], %[v32], %[v32] \n\t"
- "1: \n\t"
- "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t"
-
- "daddi %[width], %[width], -0x04 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "bnez %[width], 1b \n\t"
- : [v32]"+&f"(v32)
- : [dst_ptr]"r"(dst_argb), [width]"r"(width)
- : "memory"
- );
-}
-// clang-format on
-
-// 10 bit YUV to ARGB
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc
index b653765cc..4ed136381 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc
@@ -1753,7 +1753,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
asm volatile(
"vdup.32 d7, %2 \n" // dither4
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc
index ba34ff4c9..74190d611 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc
@@ -1979,7 +1979,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb,
- const uint32_t dither4,
+ uint32_t dither4,
int width) {
asm volatile(
"dup v1.4s, %w3 \n" // dither4
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_rvv.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_rvv.cc
index be4c4a309..27e91a3be 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_rvv.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_rvv.cc
@@ -29,78 +29,76 @@ extern "C" {
// Fill YUV -> RGB conversion constants into vectors
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
-#define YUVTORGB_SETUP(yuvconst, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, \
- v_br) \
- { \
- asm volatile("csrwi vxrm, 0"); \
- vl = __riscv_vsetvl_e8m1(w); \
- v_ub = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[0], vl); \
- v_vr = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[1], vl); \
- v_ug = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[2], vl); \
- v_vg = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[3], vl); \
- v_yg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[0], vl); \
- v_bb = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[1] + 32, vl); \
- v_bg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[2] - 32, vl); \
- v_br = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[3] + 32, vl); \
+#define YUVTORGB_SETUP(vl, yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
+ { \
+ asm volatile("csrwi vxrm, 0"); \
+ ub = yuvconst->kUVCoeff[0]; \
+ vr = yuvconst->kUVCoeff[1]; \
+ ug = yuvconst->kUVCoeff[2]; \
+ vg = yuvconst->kUVCoeff[3]; \
+ yg = yuvconst->kRGBCoeffBias[0]; \
+ bb = yuvconst->kRGBCoeffBias[1] + 32; \
+ bg = yuvconst->kRGBCoeffBias[2] - 32; \
+ br = yuvconst->kRGBCoeffBias[3] + 32; \
}
// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422
#define READYUV422(vl, v_u, v_v, v_y_16) \
{ \
- vuint8mf2_t v_tmp0, v_tmp1; \
- vuint8m1_t v_y; \
- vuint16m1_t v_u_16, v_v_16; \
- vl = __riscv_vsetvl_e8mf2((w + 1) / 2); \
- v_tmp0 = __riscv_vle8_v_u8mf2(src_u, vl); \
- v_u_16 = __riscv_vwaddu_vx_u16m1(v_tmp0, 0, vl); \
- v_tmp1 = __riscv_vle8_v_u8mf2(src_v, vl); \
- v_v_16 = __riscv_vwaddu_vx_u16m1(v_tmp1, 0, vl); \
- v_v_16 = __riscv_vmul_vx_u16m1(v_v_16, 0x0101, vl); \
- v_u_16 = __riscv_vmul_vx_u16m1(v_u_16, 0x0101, vl); \
- v_v = __riscv_vreinterpret_v_u16m1_u8m1(v_v_16); \
- v_u = __riscv_vreinterpret_v_u16m1_u8m1(v_u_16); \
- vl = __riscv_vsetvl_e8m1(w); \
- v_y = __riscv_vle8_v_u8m1(src_y, vl); \
- v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \
+ vuint8m1_t v_tmp0, v_tmp1; \
+ vuint8m2_t v_y; \
+ vuint16m2_t v_u_16, v_v_16; \
+ vl = __riscv_vsetvl_e8m1((w + 1) / 2); \
+ v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \
+ v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \
+ v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \
+ v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \
+ v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \
+ v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \
+ v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \
+ v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \
+ vl = __riscv_vsetvl_e8m2(w); \
+ v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
}
// Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444
#define READYUV444(vl, v_u, v_v, v_y_16) \
{ \
- vuint8m1_t v_y; \
- vl = __riscv_vsetvl_e8m1(w); \
- v_y = __riscv_vle8_v_u8m1(src_y, vl); \
- v_u = __riscv_vle8_v_u8m1(src_u, vl); \
- v_v = __riscv_vle8_v_u8m1(src_v, vl); \
- v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \
+ vuint8m2_t v_y; \
+ vl = __riscv_vsetvl_e8m2(w); \
+ v_y = __riscv_vle8_v_u8m2(src_y, vl); \
+ v_u = __riscv_vle8_v_u8m2(src_u, vl); \
+ v_v = __riscv_vle8_v_u8m2(src_v, vl); \
+ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \
}
// Convert from YUV to fixed point RGB
-#define YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, \
- v_y_16, v_g_16, v_b_16, v_r_16) \
+#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \
+ v_b_16, v_r_16) \
{ \
- vuint16m2_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \
- vuint32m4_t v_tmp5; \
- v_tmp0 = __riscv_vwmulu_vv_u16m2(v_u, v_ug, vl); \
- v_y_16 = __riscv_vmul_vx_u16m2(v_y_16, 0x0101, vl); \
- v_tmp0 = __riscv_vwmaccu_vv_u16m2(v_tmp0, v_vg, v_v, vl); \
- v_tmp1 = __riscv_vwmulu_vv_u16m2(v_u, v_ub, vl); \
- v_tmp5 = __riscv_vwmulu_vv_u32m4(v_y_16, v_yg, vl); \
- v_tmp2 = __riscv_vnsrl_wx_u16m2(v_tmp5, 16, vl); \
- v_tmp3 = __riscv_vadd_vv_u16m2(v_tmp2, v_bg, vl); \
- v_tmp4 = __riscv_vadd_vv_u16m2(v_tmp2, v_tmp1, vl); \
- v_tmp2 = __riscv_vwmaccu_vv_u16m2(v_tmp2, v_vr, v_v, vl); \
- v_g_16 = __riscv_vssubu_vv_u16m2(v_tmp3, v_tmp0, vl); \
- v_b_16 = __riscv_vssubu_vv_u16m2(v_tmp4, v_bb, vl); \
- v_r_16 = __riscv_vssubu_vv_u16m2(v_tmp2, v_br, vl); \
+ vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \
+ vuint32m8_t v_tmp5; \
+ v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl); \
+ v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); \
+ v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl); \
+ v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl); \
+ v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl); \
+ v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl); \
+ v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl); \
+ v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl); \
+ v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl); \
+ v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl); \
+ v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl); \
+ v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \
}
// Convert from fixed point RGB To 8 bit RGB
#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \
{ \
- v_g = __riscv_vnclipu_wx_u8m1(v_g_16, 6, vl); \
- v_b = __riscv_vnclipu_wx_u8m1(v_b_16, 6, vl); \
- v_r = __riscv_vnclipu_wx_u8m1(v_r_16, 6, vl); \
+ v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl); \
+ v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \
+ v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \
}
void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
@@ -270,20 +268,19 @@ void I444ToARGBRow_RVV(const uint8_t* src_y,
int width) {
size_t vl;
size_t w = (size_t)width;
- vuint8m1_t v_u, v_v;
- vuint8m1_t v_ub, v_vr, v_ug, v_vg;
- vuint8m1_t v_b, v_g, v_r, v_a;
- vuint16m2_t v_yg, v_bb, v_bg, v_br;
- vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
- YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
- v_br);
- v_a = __riscv_vmv_v_x_u8m1(255u, vl);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
do {
READYUV444(vl, v_u, v_v, v_y_16);
- YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
- v_y_16, v_g_16, v_b_16, v_r_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
- __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
w -= vl;
src_y += vl;
src_u += vl;
@@ -301,20 +298,19 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
int width) {
size_t vl;
size_t w = (size_t)width;
- vuint8m1_t v_u, v_v;
- vuint8m1_t v_ub, v_vr, v_ug, v_vg;
- vuint8m1_t v_b, v_g, v_r, v_a;
- vuint16m2_t v_yg, v_bb, v_bg, v_br;
- vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
- YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
- v_br);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
do {
READYUV444(vl, v_u, v_v, v_y_16);
- v_a = __riscv_vle8_v_u8m1(src_a, vl);
- YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
- v_y_16, v_g_16, v_b_16, v_r_16);
+ v_a = __riscv_vle8_v_u8m2(src_a, vl);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
- __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
w -= vl;
src_y += vl;
src_a += vl;
@@ -332,19 +328,18 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y,
int width) {
size_t vl;
size_t w = (size_t)width;
- vuint8m1_t v_u, v_v;
- vuint8m1_t v_ub, v_vr, v_ug, v_vg;
- vuint8m1_t v_b, v_g, v_r;
- vuint16m2_t v_yg, v_bb, v_bg, v_br;
- vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
- YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
- v_br);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
do {
READYUV444(vl, v_u, v_v, v_y_16);
- YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
- v_y_16, v_g_16, v_b_16, v_r_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
- __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
w -= vl;
src_y += vl;
src_u += vl;
@@ -361,20 +356,19 @@ void I422ToARGBRow_RVV(const uint8_t* src_y,
int width) {
size_t vl;
size_t w = (size_t)width;
- vuint8m1_t v_u, v_v;
- vuint8m1_t v_ub, v_vr, v_ug, v_vg;
- vuint8m1_t v_b, v_g, v_r, v_a;
- vuint16m2_t v_yg, v_bb, v_bg, v_br;
- vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
- YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
- v_br);
- v_a = __riscv_vmv_v_x_u8m1(255u, vl);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
do {
READYUV422(vl, v_u, v_v, v_y_16);
- YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
- v_y_16, v_g_16, v_b_16, v_r_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
- __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
w -= vl;
src_y += vl;
src_u += vl / 2;
@@ -392,20 +386,19 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
int width) {
size_t vl;
size_t w = (size_t)width;
- vuint8m1_t v_u, v_v;
- vuint8m1_t v_ub, v_vr, v_ug, v_vg;
- vuint8m1_t v_b, v_g, v_r, v_a;
- vuint16m2_t v_yg, v_bb, v_bg, v_br;
- vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
- YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
- v_br);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
do {
READYUV422(vl, v_u, v_v, v_y_16);
- v_a = __riscv_vle8_v_u8m1(src_a, vl);
- YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
- v_y_16, v_g_16, v_b_16, v_r_16);
+ v_a = __riscv_vle8_v_u8m2(src_a, vl);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
- __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
w -= vl;
src_y += vl;
src_a += vl;
@@ -423,20 +416,19 @@ void I422ToRGBARow_RVV(const uint8_t* src_y,
int width) {
size_t vl;
size_t w = (size_t)width;
- vuint8m1_t v_u, v_v;
- vuint8m1_t v_ub, v_vr, v_ug, v_vg;
- vuint8m1_t v_b, v_g, v_r, v_a;
- vuint16m2_t v_yg, v_bb, v_bg, v_br;
- vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
- YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
- v_br);
- v_a = __riscv_vmv_v_x_u8m1(255u, vl);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r, v_a;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
+ v_a = __riscv_vmv_v_x_u8m2(255u, vl);
do {
READYUV422(vl, v_u, v_v, v_y_16);
- YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
- v_y_16, v_g_16, v_b_16, v_r_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
- __riscv_vsseg4e8_v_u8m1(dst_rgba, v_a, v_b, v_g, v_r, vl);
+ __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
w -= vl;
src_y += vl;
src_u += vl / 2;
@@ -453,19 +445,18 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y,
int width) {
size_t vl;
size_t w = (size_t)width;
- vuint8m1_t v_u, v_v;
- vuint8m1_t v_ub, v_vr, v_ug, v_vg;
- vuint8m1_t v_b, v_g, v_r;
- vuint16m2_t v_yg, v_bb, v_bg, v_br;
- vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
- YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
- v_br);
+ uint8_t ub, vr, ug, vg;
+ int16_t yg, bb, bg, br;
+ vuint8m2_t v_u, v_v;
+ vuint8m2_t v_b, v_g, v_r;
+ vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
+ YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
do {
READYUV422(vl, v_u, v_v, v_y_16);
- YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
- v_y_16, v_g_16, v_b_16, v_r_16);
+ YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
+ v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
- __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl);
+ __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
w -= vl;
src_y += vl;
src_u += vl / 2;
@@ -528,6 +519,75 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
} while (w > 0);
}
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(w);
+ vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl);
+ __riscv_vse8_v_u8m8(dst, v_data, vl);
+ w -= vl;
+ src += vl;
+ dst += vl;
+ } while (w > 0);
+}
+
+// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ size_t dst_w = (size_t)dst_width;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+ // Blend 100 / 0 - Copy row unchanged.
+ if (y1_fraction == 0) {
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(dst_w);
+ __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+ return;
+ }
+ // Blend 50 / 50.
+ if (y1_fraction == 128) {
+ do {
+ size_t vl = __riscv_vsetvl_e8m8(dst_w);
+ vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
+ vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
+ // Averaging add
+ vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl);
+ __riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ src_ptr1 += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+ return;
+ }
+ // General purpose row blend.
+ // To match behavior on other platforms, vxrm (fixed-point rounding mode
+ // register) is set to round-to-nearest-up(0).
+ asm volatile("csrwi vxrm, 0");
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(dst_w);
+ vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
+ vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
+ vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
+ acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
+ __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl);
+ dst_w -= vl;
+ src_ptr += vl;
+ src_ptr1 += vl;
+ dst_ptr += vl;
+ } while (dst_w > 0);
+}
+
void SplitRGBRow_RVV(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
@@ -660,6 +720,42 @@ void MergeXRGBRow_RVV(const uint8_t* src_r,
} while (w > 0);
}
+void SplitUVRow_RVV(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ vuint8m4_t v_u, v_v;
+ __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl);
+ __riscv_vse8_v_u8m4(dst_u, v_u, vl);
+ __riscv_vse8_v_u8m4(dst_v, v_v, vl);
+ w -= vl;
+ dst_u += vl;
+ dst_v += vl;
+ src_uv += 2 * vl;
+ } while (w > 0);
+}
+
+void MergeUVRow_RVV(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ size_t w = (size_t)width;
+ do {
+ vuint8m4_t v_u, v_v;
+ size_t vl = __riscv_vsetvl_e8m4(w);
+ v_u = __riscv_vle8_v_u8m4(src_u, vl);
+ v_v = __riscv_vle8_v_u8m4(src_v, vl);
+ __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl);
+ w -= vl;
+ src_u += vl;
+ src_v += vl;
+ dst_uv += 2 * vl;
+ } while (w > 0);
+}
+
struct RgbConstants {
uint8_t kRGBToY[4];
uint16_t kAddY;
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_win.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_win.cc
deleted file mode 100644
index b414d2d0e..000000000
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_win.cc
+++ /dev/null
@@ -1,6440 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-// This module is for Visual C 32/64 bit
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
-
-#if defined(_M_ARM64EC)
-#include
-#elif defined(_M_X64)
-#include
-#include // For _mm_maddubs_epi16
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// 64 bit
-#if defined(_M_X64)
-
-// Read 8 UV from 444
-#define READYUV444 \
- xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
- xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
- u_buf += 8; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8;
-
-// Read 8 UV from 444, With 8 Alpha.
-#define READYUVA444 \
- xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
- xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
- u_buf += 8; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8; \
- xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
- a_buf += 8;
-
-// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 \
- xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
- xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8;
-
-// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
- xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
- xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8; \
- xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
- a_buf += 8;
-
-// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants) \
- xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80)); \
- xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
- xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
- xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \
- xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \
- xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \
- xmm0 = _mm_adds_epi16(xmm4, xmm0); \
- xmm1 = _mm_subs_epi16(xmm4, xmm1); \
- xmm2 = _mm_adds_epi16(xmm4, xmm2); \
- xmm0 = _mm_srai_epi16(xmm0, 6); \
- xmm1 = _mm_srai_epi16(xmm1, 6); \
- xmm2 = _mm_srai_epi16(xmm2, 6); \
- xmm0 = _mm_packus_epi16(xmm0, xmm0); \
- xmm1 = _mm_packus_epi16(xmm1, xmm1); \
- xmm2 = _mm_packus_epi16(xmm2, xmm2);
-
-// Store 8 ARGB values.
-#define STOREARGB \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
- xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
- _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
- _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
- dst_argb += 32;
-
-#if defined(HAS_I422TOARGBROW_SSSE3)
-void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
- const __m128i xmm5 = _mm_set1_epi8(-1);
- const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
- while (width > 0) {
- READYUV422
- YUVTORGB(yuvconstants)
- STOREARGB
- width -= 8;
- }
-}
-#endif
-
-#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
-void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
- const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
- while (width > 0) {
- READYUVA422
- YUVTORGB(yuvconstants)
- STOREARGB
- width -= 8;
- }
-}
-#endif
-
-#if defined(HAS_I444TOARGBROW_SSSE3)
-void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
- const __m128i xmm5 = _mm_set1_epi8(-1);
- const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
- while (width > 0) {
- READYUV444
- YUVTORGB(yuvconstants)
- STOREARGB
- width -= 8;
- }
-}
-#endif
-
-#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
-void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
- const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
- while (width > 0) {
- READYUVA444
- YUVTORGB(yuvconstants)
- STOREARGB
- width -= 8;
- }
-}
-#endif
-
-// 32 bit
-#else // defined(_M_X64)
-#ifdef HAS_ARGBTOYROW_SSSE3
-
-// Constants for ARGB.
-static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
- 13, 65, 33, 0, 13, 65, 33, 0};
-
-// JPeg full range.
-static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
- 15, 75, 38, 0, 15, 75, 38, 0};
-
-static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
- 112, -74, -38, 0, 112, -74, -38, 0};
-
-static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
- 127, -84, -43, 0, 127, -84, -43, 0};
-
-static const vec8 kARGBToV = {
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
-
-static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
- -20, -107, 127, 0, -20, -107, 127, 0};
-
-// vpshufb for vphaddw + vpackuswb packed to shorts.
-static const lvec8 kShufARGBToUV_AVX = {
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
-
-// Constants for BGRA.
-static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
- 0, 33, 65, 13, 0, 33, 65, 13};
-
-static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
- 0, -38, -74, 112, 0, -38, -74, 112};
-
-static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
- 0, 112, -94, -18, 0, 112, -94, -18};
-
-// Constants for ABGR.
-static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
- 33, 65, 13, 0, 33, 65, 13, 0};
-
-static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
- -38, -74, 112, 0, -38, -74, 112, 0};
-
-static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
- 112, -94, -18, 0, 112, -94, -18, 0};
-
-// Constants for RGBA.
-static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
- 0, 13, 65, 33, 0, 13, 65, 33};
-
-static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
- 0, 112, -74, -38, 0, 112, -74, -38};
-
-static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
- 0, -18, -94, 112, 0, -18, -94, 112};
-
-static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
-
-// 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
-
-// 8 bit fixed point 0.5, for bias of UV.
-static const ulvec8 kBiasUV128 = {
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
-
-// Shuffle table for converting RGB24 to ARGB.
-static const uvec8 kShuffleMaskRGB24ToARGB = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
-
-// Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
- 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
-
-// Shuffle table for converting RAW to RGB24. First 8.
-static const uvec8 kShuffleMaskRAWToRGB24_0 = {
- 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting RAW to RGB24. Middle 8.
-static const uvec8 kShuffleMaskRAWToRGB24_1 = {
- 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting RAW to RGB24. Last 8.
-static const uvec8 kShuffleMaskRAWToRGB24_2 = {
- 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting ARGB to RGB24.
-static const uvec8 kShuffleMaskARGBToRGB24 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting ARGB to RAW.
-static const uvec8 kShuffleMaskARGBToRAW = {
- 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
-
-// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
-static const uvec8 kShuffleMaskARGBToRGB24_0 = {
- 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
-
-// YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
- 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
- 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
-
-// YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
- 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
- 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
-
-// UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
- 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
- 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
-
-// UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
- 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
- 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
-
-// NV21 shuf 8 VU to 16 UV.
-static const lvec8 kShuffleNV21 = {
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-};
-
-// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
-
- convertloop:
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0
- punpckhwd xmm1, xmm1
- por xmm0, xmm5
- por xmm1, xmm5
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-#ifdef HAS_J400TOARGBROW_AVX2
-// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
- vpslld ymm5, ymm5, 24
-
- convertloop:
- vmovdqu xmm0, [eax]
- lea eax, [eax + 16]
- vpermq ymm0, ymm0, 0xd8
- vpunpcklbw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- vpunpckhwd ymm1, ymm0, ymm0
- vpunpcklwd ymm0, ymm0, ymm0
- vpor ymm0, ymm0, ymm5
- vpor ymm1, ymm1, ymm5
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_J400TOARGBROW_AVX2
-
-__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_rgb24
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
- movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm4
- por xmm2, xmm5
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm4
- movdqu [edx + 32], xmm2
- por xmm0, xmm5
- pshufb xmm1, xmm4
- movdqu [edx], xmm0
- por xmm1, xmm5
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm4
- movdqu [edx + 16], xmm1
- por xmm3, xmm5
- movdqu [edx + 48], xmm3
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
- movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm4
- por xmm2, xmm5
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm4
- movdqu [edx + 32], xmm2
- por xmm0, xmm5
- pshufb xmm1, xmm4
- movdqu [edx], xmm0
- por xmm1, xmm5
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm4
- movdqu [edx + 16], xmm1
- por xmm3, xmm5
- movdqu [edx + 48], xmm3
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
- uint8_t* dst_rgb24,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_rgb24
- mov ecx, [esp + 12] // width
- movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
- movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
- movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 4]
- movdqu xmm2, [eax + 8]
- lea eax, [eax + 24]
- pshufb xmm0, xmm3
- pshufb xmm1, xmm4
- pshufb xmm2, xmm5
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + 8], xmm1
- movq qword ptr [edx + 16], xmm2
- lea edx, [edx + 24]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-// pmul method to replicate bits.
-// Math to replicate bits:
-// (v << 8) | (v << 3)
-// v * 256 + v * 8
-// v * (256 + 8)
-// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-// 20 instructions.
-__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- movd xmm5, eax
- pshufd xmm5, xmm5, 0
- mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
- movd xmm6, eax
- pshufd xmm6, xmm6, 0
- pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
- psllw xmm3, 11
- pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
- psllw xmm4, 10
- psrlw xmm4, 5
- pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
- psllw xmm7, 8
-
- mov eax, [esp + 4] // src_rgb565
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
-
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of bgr565
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- pand xmm1, xmm3 // R in upper 5 bits
- psllw xmm2, 11 // B in upper 5 bits
- pmulhuw xmm1, xmm5 // * (256 + 8)
- pmulhuw xmm2, xmm5 // * (256 + 8)
- psllw xmm1, 8
- por xmm1, xmm2 // RB
- pand xmm0, xmm4 // G in middle 6 bits
- pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
- por xmm0, xmm7 // AG
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-#ifdef HAS_RGB565TOARGBROW_AVX2
-// pmul method to replicate bits.
-// Math to replicate bits:
-// (v << 8) | (v << 3)
-// v * 256 + v * 8
-// v * (256 + 8)
-// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- vmovd xmm5, eax
- vbroadcastss ymm5, xmm5
- mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
- vmovd xmm6, eax
- vbroadcastss ymm6, xmm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
- vpsllw ymm3, ymm3, 11
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
- vpsllw ymm4, ymm4, 10
- vpsrlw ymm4, ymm4, 5
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
- vpsllw ymm7, ymm7, 8
-
- mov eax, [esp + 4] // src_rgb565
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
-
- convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
- vpand ymm1, ymm0, ymm3 // R in upper 5 bits
- vpsllw ymm2, ymm0, 11 // B in upper 5 bits
- vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
- vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
- vpsllw ymm1, ymm1, 8
- vpor ymm1, ymm1, ymm2 // RB
- vpand ymm0, ymm0, ymm4 // G in middle 6 bits
- vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
- vpor ymm0, ymm0, ymm7 // AG
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
- vpermq ymm1, ymm1, 0xd8
- vpunpckhbw ymm2, ymm1, ymm0
- vpunpcklbw ymm1, ymm1, ymm0
- vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB
- vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB
- lea eax, [eax + 32]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_RGB565TOARGBROW_AVX2
-
-#ifdef HAS_ARGB1555TOARGBROW_AVX2
-__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- vmovd xmm5, eax
- vbroadcastss ymm5, xmm5
- mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
- vmovd xmm6, eax
- vbroadcastss ymm6, xmm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
- vpsllw ymm3, ymm3, 11
- vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
- vpsllw ymm7, ymm7, 8
-
- mov eax, [esp + 4] // src_argb1555
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
-
- convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
- vpsllw ymm1, ymm0, 1 // R in upper 5 bits
- vpsllw ymm2, ymm0, 11 // B in upper 5 bits
- vpand ymm1, ymm1, ymm3
- vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
- vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
- vpsllw ymm1, ymm1, 8
- vpor ymm1, ymm1, ymm2 // RB
- vpsraw ymm2, ymm0, 8 // A
- vpand ymm0, ymm0, ymm4 // G in middle 5 bits
- vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
- vpand ymm2, ymm2, ymm7
- vpor ymm0, ymm0, ymm2 // AG
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
- vpermq ymm1, ymm1, 0xd8
- vpunpckhbw ymm2, ymm1, ymm0
- vpunpcklbw ymm1, ymm1, ymm0
- vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
- vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
- lea eax, [eax + 32]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGB1555TOARGBROW_AVX2
-
-#ifdef HAS_ARGB4444TOARGBROW_AVX2
-__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
- vmovd xmm4, eax
- vbroadcastss ymm4, xmm4
- vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
- mov eax, [esp + 4] // src_argb4444
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
-
- convertloop:
- vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
- vpand ymm2, ymm0, ymm5 // mask high nibbles
- vpand ymm0, ymm0, ymm4 // mask low nibbles
- vpsrlw ymm3, ymm2, 4
- vpsllw ymm1, ymm0, 4
- vpor ymm2, ymm2, ymm3
- vpor ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // mutate for unpack
- vpermq ymm2, ymm2, 0xd8
- vpunpckhbw ymm1, ymm0, ymm2
- vpunpcklbw ymm0, ymm0, ymm2
- vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
- vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
- lea eax, [eax + 32]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGB4444TOARGBROW_AVX2
-
-// 24 instructions
-__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x01080108 // generate multiplier to repeat 5 bits
- movd xmm5, eax
- pshufd xmm5, xmm5, 0
- mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
- movd xmm6, eax
- pshufd xmm6, xmm6, 0
- pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
- psllw xmm3, 11
- movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
- psrlw xmm4, 6
- pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
- psllw xmm7, 8
-
- mov eax, [esp + 4] // src_argb1555
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
-
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of 1555
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- psllw xmm1, 1 // R in upper 5 bits
- psllw xmm2, 11 // B in upper 5 bits
- pand xmm1, xmm3
- pmulhuw xmm2, xmm5 // * (256 + 8)
- pmulhuw xmm1, xmm5 // * (256 + 8)
- psllw xmm1, 8
- por xmm1, xmm2 // RB
- movdqa xmm2, xmm0
- pand xmm0, xmm4 // G in middle 5 bits
- psraw xmm2, 8 // A
- pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
- pand xmm2, xmm7
- por xmm0, xmm2 // AG
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-// 18 instructions.
-__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
- movd xmm4, eax
- pshufd xmm4, xmm4, 0
- movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
- pslld xmm5, 4
- mov eax, [esp + 4] // src_argb4444
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- sub edx, eax
-
- convertloop:
- movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
- movdqa xmm2, xmm0
- pand xmm0, xmm4 // mask low nibbles
- pand xmm2, xmm5 // mask high nibbles
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- psllw xmm1, 4
- psrlw xmm3, 4
- por xmm0, xmm1
- por xmm2, xmm3
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
- movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
- lea eax, [eax + 16]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
-
- convertloop:
- movdqu xmm0, [eax] // fetch 16 pixels of argb
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- lea eax, [eax + 64]
- pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
- pshufb xmm1, xmm6
- pshufb xmm2, xmm6
- pshufb xmm3, xmm6
- movdqa xmm4, xmm1 // 4 bytes from 1 for 0
- psrldq xmm1, 4 // 8 bytes from 1
- pslldq xmm4, 12 // 4 bytes from 1 for 0
- movdqa xmm5, xmm2 // 8 bytes from 2 for 1
- por xmm0, xmm4 // 4 bytes from 1 for 0
- pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqu [edx], xmm0 // store 0
- por xmm1, xmm5 // 8 bytes from 2 for 1
- psrldq xmm2, 8 // 4 bytes from 2
- pslldq xmm3, 4 // 12 bytes from 3 for 2
- por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqu [edx + 16], xmm1 // store 1
- movdqu [edx + 32], xmm2 // store 2
- lea edx, [edx + 48]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
-
- convertloop:
- movdqu xmm0, [eax] // fetch 16 pixels of argb
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- lea eax, [eax + 64]
- pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
- pshufb xmm1, xmm6
- pshufb xmm2, xmm6
- pshufb xmm3, xmm6
- movdqa xmm4, xmm1 // 4 bytes from 1 for 0
- psrldq xmm1, 4 // 8 bytes from 1
- pslldq xmm4, 12 // 4 bytes from 1 for 0
- movdqa xmm5, xmm2 // 8 bytes from 2 for 1
- por xmm0, xmm4 // 4 bytes from 1 for 0
- pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqu [edx], xmm0 // store 0
- por xmm1, xmm5 // 8 bytes from 2 for 1
- psrldq xmm2, 8 // 4 bytes from 2
- pslldq xmm3, 4 // 12 bytes from 3 for 2
- por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqu [edx + 16], xmm1 // store 1
- movdqu [edx + 32], xmm2 // store 2
- lea edx, [edx + 48]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
- psrld xmm3, 27
- pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
- psrld xmm4, 26
- pslld xmm4, 5
- pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
- pslld xmm5, 11
-
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm1, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm1, xmm3 // B
- pand xmm2, xmm4 // G
- pand xmm0, xmm5 // R
- por xmm1, xmm2 // BG
- por xmm0, xmm1 // BGR
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- __asm {
-
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- movd xmm6, [esp + 12] // dither4
- mov ecx, [esp + 16] // width
- punpcklbw xmm6, xmm6 // make dither 16 bytes
- movdqa xmm7, xmm6
- punpcklwd xmm6, xmm6
- punpckhwd xmm7, xmm7
- pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
- psrld xmm3, 27
- pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
- psrld xmm4, 26
- pslld xmm4, 5
- pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
- pslld xmm5, 11
-
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- paddusb xmm0, xmm6 // add dither
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm1, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm1, xmm3 // B
- pand xmm2, xmm4 // G
- pand xmm0, xmm5 // R
- por xmm1, xmm2 // BG
- por xmm0, xmm1 // BGR
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
-}
-
-#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- const uint32_t dither4,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- vbroadcastss xmm6, [esp + 12] // dither4
- mov ecx, [esp + 16] // width
- vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
- vpermq ymm6, ymm6, 0xd8
- vpunpcklwd ymm6, ymm6, ymm6
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
- vpsrld ymm3, ymm3, 27
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
- vpsrld ymm4, ymm4, 26
- vpslld ymm4, ymm4, 5
- vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
-
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpaddusb ymm0, ymm0, ymm6 // add dither
- vpsrld ymm2, ymm0, 5 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrld ymm0, ymm0, 8 // R
- vpand ymm2, ymm2, ymm4 // G
- vpand ymm1, ymm1, ymm3 // B
- vpand ymm0, ymm0, ymm5 // R
- vpor ymm1, ymm1, ymm2 // BG
- vpor ymm0, ymm0, ymm1 // BGR
- vpackusdw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of RGB565
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTORGB565DITHERROW_AVX2
-
-// TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
- psrld xmm4, 27
- movdqa xmm5, xmm4 // generate mask 0x000003e0
- pslld xmm5, 5
- movdqa xmm6, xmm4 // generate mask 0x00007c00
- pslld xmm6, 10
- pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
- pslld xmm7, 15
-
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0 // B
- movdqa xmm2, xmm0 // G
- movdqa xmm3, xmm0 // R
- psrad xmm0, 16 // A
- psrld xmm1, 3 // B
- psrld xmm2, 6 // G
- psrld xmm3, 9 // R
- pand xmm0, xmm7 // A
- pand xmm1, xmm4 // B
- pand xmm2, xmm5 // G
- pand xmm3, xmm6 // R
- por xmm0, xmm1 // BA
- por xmm2, xmm3 // GR
- por xmm0, xmm2 // BGRA
- packssdw xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
- psllw xmm4, 12
- movdqa xmm3, xmm4 // generate mask 0x00f000f0
- psrlw xmm3, 8
-
- convertloop:
- movdqu xmm0, [eax] // fetch 4 pixels of argb
- movdqa xmm1, xmm0
- pand xmm0, xmm3 // low nibble
- pand xmm1, xmm4 // high nibble
- psrld xmm0, 4
- psrld xmm1, 8
- por xmm0, xmm1
- packuswb xmm0, xmm0
- lea eax, [eax + 16]
- movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
- lea edx, [edx + 8]
- sub ecx, 4
- jg convertloop
- ret
- }
-}
-
-#ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
- vpsrld ymm3, ymm3, 27
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
- vpsrld ymm4, ymm4, 26
- vpslld ymm4, ymm4, 5
- vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
-
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpsrld ymm2, ymm0, 5 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrld ymm0, ymm0, 8 // R
- vpand ymm2, ymm2, ymm4 // G
- vpand ymm1, ymm1, ymm3 // B
- vpand ymm0, ymm0, ymm5 // R
- vpor ymm1, ymm1, ymm2 // BG
- vpor ymm0, ymm0, ymm1 // BGR
- vpackusdw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of RGB565
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTORGB565ROW_AVX2
-
-#ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm4, ymm4, ymm4
- vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
- vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
- vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
- vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
- vpslld ymm7, ymm7, 15
-
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpsrld ymm3, ymm0, 9 // R
- vpsrld ymm2, ymm0, 6 // G
- vpsrld ymm1, ymm0, 3 // B
- vpsrad ymm0, ymm0, 16 // A
- vpand ymm3, ymm3, ymm6 // R
- vpand ymm2, ymm2, ymm5 // G
- vpand ymm1, ymm1, ymm4 // B
- vpand ymm0, ymm0, ymm7 // A
- vpor ymm0, ymm0, ymm1 // BA
- vpor ymm2, ymm2, ymm3 // GR
- vpor ymm0, ymm0, ymm2 // BGRA
- vpackssdw ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTOARGB1555ROW_AVX2
-
-#ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
- uint8_t* dst_rgb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
- vpsllw ymm4, ymm4, 12
- vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
-
- convertloop:
- vmovdqu ymm0, [eax] // fetch 8 pixels of argb
- vpand ymm1, ymm0, ymm4 // high nibble
- vpand ymm0, ymm0, ymm3 // low nibble
- vpsrld ymm1, ymm1, 8
- vpsrld ymm0, ymm0, 4
- vpor ymm0, ymm0, ymm1
- vpackuswb ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8
- lea eax, [eax + 32]
- vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
- lea edx, [edx + 16]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTOARGB4444ROW_AVX2
-
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToY
- movdqa xmm5, xmmword ptr kAddY16
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
-// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToYJ
- movdqa xmm5, xmmword ptr kAddYJ64
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- paddw xmm0, xmm5 // Add .5 for rounding.
- paddw xmm2, xmm5
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-#ifdef HAS_ARGBTOYROW_AVX2
-// vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
-
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- vbroadcastf128 ymm4, xmmword ptr kARGBToY
- vbroadcastf128 ymm5, xmmword ptr kAddY16
- vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
-
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpmaddubsw ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- lea eax, [eax + 128]
- vphaddw ymm0, ymm0, ymm1 // mutates.
- vphaddw ymm2, ymm2, ymm3
- vpsrlw ymm0, ymm0, 7
- vpsrlw ymm2, ymm2, 7
- vpackuswb ymm0, ymm0, ymm2 // mutates.
- vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- vpaddb ymm0, ymm0, ymm5 // add 16 for Y
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTOYROW_AVX2
-
-#ifdef HAS_ARGBTOYJROW_AVX2
-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
- vbroadcastf128 ymm5, xmmword ptr kAddYJ64
- vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
-
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpmaddubsw ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- lea eax, [eax + 128]
- vphaddw ymm0, ymm0, ymm1 // mutates.
- vphaddw ymm2, ymm2, ymm3
- vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
- vpaddw ymm2, ymm2, ymm5
- vpsrlw ymm0, ymm0, 7
- vpsrlw ymm2, ymm2, 7
- vpackuswb ymm0, ymm0, ymm2 // mutates.
- vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTOYJROW_AVX2
-
-__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kBGRAToY
- movdqa xmm5, xmmword ptr kAddY16
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kABGRToY
- movdqa xmm5, xmmword ptr kAddY16
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kRGBAToY
- movdqa xmm5, xmmword ptr kAddY16
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kBiasUV128
- movdqa xmm6, xmmword ptr kARGBToV
- movdqa xmm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
-
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
-
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kBiasUV128
- movdqa xmm6, xmmword ptr kARGBToVJ
- movdqa xmm7, xmmword ptr kARGBToUJ
- sub edi, edx // stride from u to v
-
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
-
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- paddw xmm0, xmm5 // +.5 rounding -> unsigned
- paddw xmm1, xmm5
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
-
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kBiasUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToV
- vbroadcastf128 ymm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
-
- convertloop:
- /* step 1 - subsample 32x2 argb pixels to 16x1 */
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- vpavgb ymm2, ymm2, [eax + esi + 64]
- vpavgb ymm3, ymm3, [eax + esi + 96]
- lea eax, [eax + 128]
- vshufps ymm4, ymm0, ymm1, 0x88
- vshufps ymm0, ymm0, ymm1, 0xdd
- vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
- vshufps ymm4, ymm2, ymm3, 0x88
- vshufps ymm2, ymm2, ymm3, 0xdd
- vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, its 16 pixels of U and 16 of V
- vpmaddubsw ymm1, ymm0, ymm7 // U
- vpmaddubsw ymm3, ymm2, ymm7
- vpmaddubsw ymm0, ymm0, ymm6 // V
- vpmaddubsw ymm2, ymm2, ymm6
- vphaddw ymm1, ymm1, ymm3 // mutates
- vphaddw ymm0, ymm0, ymm2
- vpsraw ymm1, ymm1, 8
- vpsraw ymm0, ymm0, 8
- vpacksswb ymm0, ymm1, ymm0 // mutates
- vpermq ymm0, ymm0, 0xd8 // For vpacksswb
- vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
- vpaddb ymm0, ymm0, ymm5 // -> unsigned
-
- // step 3 - store 16 U and 16 V values
- vextractf128 [edx], ymm0, 0 // U
- vextractf128 [edx + edi], ymm0, 1 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTOUVROW_AVX2
-
-#ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kBiasUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
- vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
- sub edi, edx // stride from u to v
-
- convertloop:
- /* step 1 - subsample 32x2 argb pixels to 16x1 */
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- vpavgb ymm2, ymm2, [eax + esi + 64]
- vpavgb ymm3, ymm3, [eax + esi + 96]
- lea eax, [eax + 128]
- vshufps ymm4, ymm0, ymm1, 0x88
- vshufps ymm0, ymm0, ymm1, 0xdd
- vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
- vshufps ymm4, ymm2, ymm3, 0x88
- vshufps ymm2, ymm2, ymm3, 0xdd
- vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, its 16 pixels of U and 16 of V
- vpmaddubsw ymm1, ymm0, ymm7 // U
- vpmaddubsw ymm3, ymm2, ymm7
- vpmaddubsw ymm0, ymm0, ymm6 // V
- vpmaddubsw ymm2, ymm2, ymm6
- vphaddw ymm1, ymm1, ymm3 // mutates
- vphaddw ymm0, ymm0, ymm2
- vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
- vpaddw ymm0, ymm0, ymm5
- vpsraw ymm1, ymm1, 8
- vpsraw ymm0, ymm0, 8
- vpacksswb ymm0, ymm1, ymm0 // mutates
- vpermq ymm0, ymm0, 0xd8 // For vpacksswb
- vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
-
- // step 3 - store 16 U and 16 V values
- vextractf128 [edx], ymm0, 0 // U
- vextractf128 [edx + edi], ymm0, 1 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTOUVJROW_AVX2
-
-__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- movdqa xmm5, xmmword ptr kBiasUV128
- movdqa xmm6, xmmword ptr kARGBToV
- movdqa xmm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
-
- convertloop:
- /* convert to U and V */
- movdqu xmm0, [eax] // U
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- movdqu [edx], xmm0
-
- movdqu xmm0, [eax] // V
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm6
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm2, xmm6
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- lea eax, [eax + 64]
- movdqu [edx + edi], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kBiasUV128
- movdqa xmm6, xmmword ptr kBGRAToV
- movdqa xmm7, xmmword ptr kBGRAToU
- sub edi, edx // stride from u to v
-
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
-
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kBiasUV128
- movdqa xmm6, xmmword ptr kABGRToV
- movdqa xmm7, xmmword ptr kABGRToU
- sub edi, edx // stride from u to v
-
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
-
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kBiasUV128
- movdqa xmm6, xmmword ptr kRGBAToV
- movdqa xmm7, xmmword ptr kRGBAToU
- sub edi, edx // stride from u to v
-
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm1, [eax + 16]
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm2, [eax + 32]
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
-
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBTOYROW_SSSE3
-
-// Read 16 UV from 444
-#define READYUV444_AVX2 \
- __asm { \
- __asm vmovdqu xmm3, [esi] /* U */ \
- __asm vmovdqu xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm3, ymm3, 0xd8 \
- __asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16]}
-
-// Read 16 UV from 444. With 16 Alpha.
-#define READYUVA444_AVX2 \
- __asm { \
- __asm vmovdqu xmm3, [esi] /* U */ \
- __asm vmovdqu xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm3, ymm3, 0xd8 \
- __asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- __asm vmovdqu xmm5, [ebp] /* A */ \
- __asm vpermq ymm5, ymm5, 0xd8 \
- __asm lea ebp, [ebp + 16]}
-
-// Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 \
- __asm { \
- __asm vmovq xmm3, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
- __asm vpermq ymm3, ymm3, 0xd8 \
- __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16]}
-
-// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
-#define READYUVA422_AVX2 \
- __asm { \
- __asm vmovq xmm3, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
- __asm vpermq ymm3, ymm3, 0xd8 \
- __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- __asm vmovdqu xmm5, [ebp] /* A */ \
- __asm vpermq ymm5, ymm5, 0xd8 \
- __asm lea ebp, [ebp + 16]}
-
-// Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 \
- __asm { \
- __asm vmovdqu xmm3, [esi] /* UV */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm3, ymm3, 0xd8 \
- __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16]}
-
-// Read 8 UV from NV21, upsample to 16 UV.
-#define READNV21_AVX2 \
- __asm { \
- __asm vmovdqu xmm3, [esi] /* UV */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm3, ymm3, 0xd8 \
- __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16]}
-
-// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* YUY2 */ \
- __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm3, [eax] /* UV */ \
- __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \
- __asm lea eax, [eax + 32]}
-
-// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* UYVY */ \
- __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm3, [eax] /* UV */ \
- __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \
- __asm lea eax, [eax + 32]}
-
-// Convert 16 pixels: 16 UV and 16 Y.
-#define YUVTORGB_AVX2(YuvConstants) \
- __asm { \
- __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
- __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
- __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
- __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
- __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
- __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
- __asm vpmaddubsw ymm2, ymm2, ymm3 /* B UV */ \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
- __asm vpaddw ymm4, ymm3, ymm4 \
- __asm vpaddsw ymm0, ymm0, ymm4 \
- __asm vpsubsw ymm1, ymm4, ymm1 \
- __asm vpaddsw ymm2, ymm2, ymm4 \
- __asm vpsraw ymm0, ymm0, 6 \
- __asm vpsraw ymm1, ymm1, 6 \
- __asm vpsraw ymm2, ymm2, 6 \
- __asm vpackuswb ymm0, ymm0, ymm0 \
- __asm vpackuswb ymm1, ymm1, ymm1 \
- __asm vpackuswb ymm2, ymm2, ymm2}
-
-// Store 16 ARGB values.
-#define STOREARGB_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
- __asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
- __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
- __asm vmovdqu 0[edx], ymm1 \
- __asm vmovdqu 32[edx], ymm0 \
- __asm lea edx, [edx + 64]}
-
-// Store 16 RGBA values.
-#define STORERGBA_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
- __asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
- __asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
- __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
- __asm vmovdqu [edx], ymm0 \
- __asm vmovdqu [edx + 32], ymm1 \
- __asm lea edx, [edx + 64]}
-
-#ifdef HAS_I422TOARGBROW_AVX2
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) void I422ToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
-
- convertloop:
- READYUV422_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
-
- sub ecx, 16
- jg convertloop
-
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_I422TOARGBROW_AVX2
-
-#ifdef HAS_I422ALPHATOARGBROW_AVX2
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-__declspec(naked) void I422AlphaToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov ebp, [esp + 16 + 16] // A
- mov edx, [esp + 16 + 20] // argb
- mov ebx, [esp + 16 + 24] // yuvconstants
- mov ecx, [esp + 16 + 28] // width
- sub edi, esi
-
- convertloop:
- READYUVA422_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
-
- sub ecx, 16
- jg convertloop
-
- pop ebp
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_I422ALPHATOARGBROW_AVX2
-
-#ifdef HAS_I444TOARGBROW_AVX2
-// 16 pixels
-// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) void I444ToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- convertloop:
- READYUV444_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
-
- sub ecx, 16
- jg convertloop
-
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_I444TOARGBROW_AVX2
-
-#ifdef HAS_I444ALPHATOARGBROW_AVX2
-// 16 pixels
-// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) void I444AlphaToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov ebp, [esp + 16 + 16] // A
- mov edx, [esp + 16 + 20] // argb
- mov ebx, [esp + 16 + 24] // yuvconstants
- mov ecx, [esp + 16 + 28] // width
- sub edi, esi
- convertloop:
- READYUVA444_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
-
- sub ecx, 16
- jg convertloop
-
- pop ebp
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_I444AlphaTOARGBROW_AVX2
-
-#ifdef HAS_NV12TOARGBROW_AVX2
-// 16 pixels.
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) void NV12ToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* uv_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // UV
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
-
- convertloop:
- READNV12_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
-
- sub ecx, 16
- jg convertloop
-
- pop ebx
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_NV12TOARGBROW_AVX2
-
-#ifdef HAS_NV21TOARGBROW_AVX2
-// 16 pixels.
-// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) void NV21ToARGBRow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* vu_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // VU
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
-
- convertloop:
- READNV21_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
-
- sub ecx, 16
- jg convertloop
-
- pop ebx
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_NV21TOARGBROW_AVX2
-
-#ifdef HAS_YUY2TOARGBROW_AVX2
-// 16 pixels.
-// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked) void YUY2ToARGBRow_AVX2(
- const uint8_t* src_yuy2,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // yuy2
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
-
- convertloop:
- READYUY2_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
-
- sub ecx, 16
- jg convertloop
-
- pop ebx
- vzeroupper
- ret
- }
-}
-#endif // HAS_YUY2TOARGBROW_AVX2
-
-#ifdef HAS_UYVYTOARGBROW_AVX2
-// 16 pixels.
-// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked) void UYVYToARGBRow_AVX2(
- const uint8_t* src_uyvy,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // uyvy
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
-
- convertloop:
- READUYVY_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
-
- sub ecx, 16
- jg convertloop
-
- pop ebx
- vzeroupper
- ret
- }
-}
-#endif // HAS_UYVYTOARGBROW_AVX2
-
-#ifdef HAS_I422TORGBAROW_AVX2
-// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-__declspec(naked) void I422ToRGBARow_AVX2(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // abgr
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
-
- convertloop:
- READYUV422_AVX2
- YUVTORGB_AVX2(ebx)
- STORERGBA_AVX2
-
- sub ecx, 16
- jg convertloop
-
- pop ebx
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_I422TORGBAROW_AVX2
-
-#if defined(HAS_I422TOARGBROW_SSSE3)
-// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
-// Allows a conversion with half size scaling.
-
-// Read 8 UV from 444.
-#define READYUV444 \
- __asm { \
- __asm movq xmm3, qword ptr [esi] /* U */ \
- __asm movq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm punpcklbw xmm3, xmm1 /* UV */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8]}
-
-// Read 4 UV from 444. With 8 Alpha.
-#define READYUVA444 \
- __asm { \
- __asm movq xmm3, qword ptr [esi] /* U */ \
- __asm movq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm punpcklbw xmm3, xmm1 /* UV */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- __asm movq xmm5, qword ptr [ebp] /* A */ \
- __asm lea ebp, [ebp + 8]}
-
-// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 \
- __asm { \
- __asm movd xmm3, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 4] \
- __asm punpcklbw xmm3, xmm1 /* UV */ \
- __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8]}
-
-// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
- __asm { \
- __asm movd xmm3, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 4] \
- __asm punpcklbw xmm3, xmm1 /* UV */ \
- __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] /* Y */ \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- __asm movq xmm5, qword ptr [ebp] /* A */ \
- __asm lea ebp, [ebp + 8]}
-
-// Read 4 UV from NV12, upsample to 8 UV.
-#define READNV12 \
- __asm { \
- __asm movq xmm3, qword ptr [esi] /* UV */ \
- __asm lea esi, [esi + 8] \
- __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8]}
-
-// Read 4 VU from NV21, upsample to 8 UV.
-#define READNV21 \
- __asm { \
- __asm movq xmm3, qword ptr [esi] /* UV */ \
- __asm lea esi, [esi + 8] \
- __asm pshufb xmm3, xmmword ptr kShuffleNV21 \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8]}
-
-// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
-#define READYUY2 \
- __asm { \
- __asm movdqu xmm4, [eax] /* YUY2 */ \
- __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
- __asm movdqu xmm3, [eax] /* UV */ \
- __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \
- __asm lea eax, [eax + 16]}
-
-// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
-#define READUYVY \
- __asm { \
- __asm movdqu xmm4, [eax] /* UYVY */ \
- __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
- __asm movdqu xmm3, [eax] /* UV */ \
- __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \
- __asm lea eax, [eax + 16]}
-
-// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(YuvConstants) \
- __asm { \
- __asm psubb xmm3, xmmword ptr kBiasUV128 \
- __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
- __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \
- __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \
- __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \
- __asm pmaddubsw xmm0, xmm3 \
- __asm pmaddubsw xmm1, xmm3 \
- __asm pmaddubsw xmm2, xmm3 \
- __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \
- __asm paddw xmm4, xmm3 \
- __asm paddsw xmm0, xmm4 \
- __asm paddsw xmm2, xmm4 \
- __asm psubsw xmm4, xmm1 \
- __asm movdqa xmm1, xmm4 \
- __asm psraw xmm0, 6 \
- __asm psraw xmm1, 6 \
- __asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
- __asm packuswb xmm2, xmm2 /* R */ \
- }
-
-// Store 8 ARGB values.
-#define STOREARGB \
- __asm { \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm5 /* RA */ \
- __asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
- __asm movdqu 0[edx], xmm0 \
- __asm movdqu 16[edx], xmm1 \
- __asm lea edx, [edx + 32]}
-
-// Store 8 BGRA values.
-#define STOREBGRA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm0 /* GB */ \
- __asm punpcklbw xmm5, xmm2 /* AR */ \
- __asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
- __asm movdqu 0[edx], xmm5 \
- __asm movdqu 16[edx], xmm0 \
- __asm lea edx, [edx + 32]}
-
-// Store 8 RGBA values.
-#define STORERGBA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm2 /* GR */ \
- __asm punpcklbw xmm5, xmm0 /* AB */ \
- __asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
- __asm movdqu 0[edx], xmm5 \
- __asm movdqu 16[edx], xmm0 \
- __asm lea edx, [edx + 32]}
-
-// Store 8 RGB24 values.
-#define STORERGB24 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
- __asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
- __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
- __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
- __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
- __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
- __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
- __asm lea edx, [edx + 24]}
-
-// Store 8 RGB565 values.
-#define STORERGB565 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
- __asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
- __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
- __asm movdqa xmm2, xmm0 /* G */ \
- __asm pslld xmm0, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm0, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm0, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm0, xmm3 /* BGR */ \
- __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
- __asm movdqa xmm2, xmm1 /* G */ \
- __asm pslld xmm1, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm1, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm1, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm1, xmm3 /* BGR */ \
- __asm packssdw xmm0, xmm1 \
- __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
- __asm lea edx, [edx + 16]}
-
-// 8 pixels.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) void I444ToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
-
- convertloop:
- READYUV444
- YUVTORGB(ebx)
- STOREARGB
-
- sub ecx, 8
- jg convertloop
-
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels.
-// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes).
-__declspec(naked) void I444AlphaToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov ebp, [esp + 16 + 16] // A
- mov edx, [esp + 16 + 20] // argb
- mov ebx, [esp + 16 + 24] // yuvconstants
- mov ecx, [esp + 16 + 28] // width
- sub edi, esi
-
- convertloop:
- READYUVA444
- YUVTORGB(ebx)
- STOREARGB
-
- sub ecx, 8
- jg convertloop
-
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
-__declspec(naked) void I422ToRGB24Row_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
-
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STORERGB24
-
- sub ecx, 8
- jg convertloop
-
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels.
-// 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes).
-__declspec(naked) void I444ToRGB24Row_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
-
- convertloop:
- READYUV444
- YUVTORGB(ebx)
- STORERGB24
-
- sub ecx, 8
- jg convertloop
-
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
-__declspec(naked) void I422ToRGB565Row_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* rgb565_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
- psrld xmm5, 27
- pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
- psrld xmm6, 26
- pslld xmm6, 5
- pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
- pslld xmm7, 11
-
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STORERGB565
-
- sub ecx, 8
- jg convertloop
-
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) void I422ToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
-
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STOREARGB
-
- sub ecx, 8
- jg convertloop
-
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
-__declspec(naked) void I422AlphaToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- const uint8_t* a_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov ebp, [esp + 16 + 16] // A
- mov edx, [esp + 16 + 20] // argb
- mov ebx, [esp + 16 + 24] // yuvconstants
- mov ecx, [esp + 16 + 28] // width
- sub edi, esi
-
- convertloop:
- READYUVA422
- YUVTORGB(ebx)
- STOREARGB
-
- sub ecx, 8
- jg convertloop
-
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) void NV12ToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* uv_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // UV
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
-
- convertloop:
- READNV12
- YUVTORGB(ebx)
- STOREARGB
-
- sub ecx, 8
- jg convertloop
-
- pop ebx
- pop esi
- ret
- }
-}
-
-// 8 pixels.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) void NV21ToARGBRow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* vu_buf,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // VU
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
-
- convertloop:
- READNV21
- YUVTORGB(ebx)
- STOREARGB
-
- sub ecx, 8
- jg convertloop
-
- pop ebx
- pop esi
- ret
- }
-}
-
-// 8 pixels.
-// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked) void YUY2ToARGBRow_SSSE3(
- const uint8_t* src_yuy2,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // yuy2
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
-
- convertloop:
- READYUY2
- YUVTORGB(ebx)
- STOREARGB
-
- sub ecx, 8
- jg convertloop
-
- pop ebx
- ret
- }
-}
-
-// 8 pixels.
-// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked) void UYVYToARGBRow_SSSE3(
- const uint8_t* src_uyvy,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // uyvy
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
-
- convertloop:
- READUYVY
- YUVTORGB(ebx)
- STOREARGB
-
- sub ecx, 8
- jg convertloop
-
- pop ebx
- ret
- }
-}
-
-__declspec(naked) void I422ToRGBARow_SSSE3(
- const uint8_t* y_buf,
- const uint8_t* u_buf,
- const uint8_t* v_buf,
- uint8_t* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width) {
- __asm {
- push esi
- push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
- sub edi, esi
-
- convertloop:
- READYUV422
- YUVTORGB(ebx)
- STORERGBA
-
- sub ecx, 8
- jg convertloop
-
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_I422TOARGBROW_SSSE3
-
-// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
-#ifdef HAS_I400TOARGBROW_SSE2
-// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
-__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
- uint8_t* rgb_buf,
- const struct YuvConstants*,
- int width) {
- __asm {
- mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
- movd xmm2, eax
- pshufd xmm2, xmm2,0
- mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
- movd xmm3, eax
- pshufd xmm3, xmm3, 0
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
-
- mov eax, [esp + 4] // Y
- mov edx, [esp + 8] // rgb
- mov ecx, [esp + 12] // width
-
- convertloop:
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0 // Y.Y
- pmulhuw xmm0, xmm2
- psubusw xmm0, xmm3
- psrlw xmm0, 6
- packuswb xmm0, xmm0 // G
-
- // Step 2: Weave into ARGB
- punpcklbw xmm0, xmm0 // GG
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0 // BGRA first 4 pixels
- punpckhwd xmm1, xmm1 // BGRA next 4 pixels
- por xmm0, xmm4
- por xmm1, xmm4
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-#endif // HAS_I400TOARGBROW_SSE2
-
-#ifdef HAS_I400TOARGBROW_AVX2
-// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
-// note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
- uint8_t* rgb_buf,
- const struct YuvConstants*,
- int width) {
- __asm {
- mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
- vmovd xmm2, eax
- vbroadcastss ymm2, xmm2
- mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
- vmovd xmm3, eax
- vbroadcastss ymm3, xmm3
- vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
- vpslld ymm4, ymm4, 24
-
- mov eax, [esp + 4] // Y
- mov edx, [esp + 8] // rgb
- mov ecx, [esp + 12] // width
-
- convertloop:
- // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
- vmovdqu xmm0, [eax]
- lea eax, [eax + 16]
- vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
- vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
- vpmulhuw ymm0, ymm0, ymm2
- vpsubusw ymm0, ymm0, ymm3
- vpsrlw ymm0, ymm0, 6
- vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
-
- // TODO(fbarchard): Weave alpha with unpack.
- // Step 2: Weave into ARGB
- vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
- vpermq ymm1, ymm1, 0xd8
- vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
- vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
- vpor ymm0, ymm0, ymm4
- vpor ymm1, ymm1, ymm4
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_I400TOARGBROW_AVX2
-
-#ifdef HAS_MIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
- 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-
-// TODO(fbarchard): Replace lea with -16 offset.
-__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- movdqa xmm5, xmmword ptr kShuffleMirror
-
- convertloop:
- movdqu xmm0, [eax - 16 + ecx]
- pshufb xmm0, xmm5
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-#endif // HAS_MIRRORROW_SSSE3
-
-#ifdef HAS_MIRRORROW_AVX2
-__declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
-
- convertloop:
- vmovdqu ymm0, [eax - 32 + ecx]
- vpshufb ymm0, ymm0, ymm5
- vpermq ymm0, ymm0, 0x4e // swap high and low halfs
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_MIRRORROW_AVX2
-
-#ifdef HAS_MIRRORSPLITUVROW_SSSE3
-// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
- 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-
-__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- movdqa xmm1, xmmword ptr kShuffleMirrorUV
- lea eax, [eax + ecx * 2 - 16]
- sub edi, edx
-
- convertloop:
- movdqu xmm0, [eax]
- lea eax, [eax - 16]
- pshufb xmm0, xmm1
- movlpd qword ptr [edx], xmm0
- movhpd qword ptr [edx + edi], xmm0
- lea edx, [edx + 8]
- sub ecx, 8
- jg convertloop
-
- pop edi
- ret
- }
-}
-#endif // HAS_MIRRORSPLITUVROW_SSSE3
-
-#ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
-
- convertloop:
- movdqu xmm0, [eax]
- lea eax, [eax - 16]
- pshufd xmm0, xmm0, 0x1b
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
- ret
- }
-}
-#endif // HAS_ARGBMIRRORROW_SSE2
-
-#ifdef HAS_ARGBMIRRORROW_AVX2
-// Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-
-__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
-
- convertloop:
- vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBMIRRORROW_AVX2
-
-#ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm5 // even bytes
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm2, 8 // odd bytes
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqu [edx], xmm0
- movdqu [edx + edi], xmm2
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-#endif // HAS_SPLITUVROW_SSE2
-
-#ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
-
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm2, ymm0, 8 // odd bytes
- vpsrlw ymm3, ymm1, 8
- vpand ymm0, ymm0, ymm5 // even bytes
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1
- vpackuswb ymm2, ymm2, ymm3
- vpermq ymm0, ymm0, 0xd8
- vpermq ymm2, ymm2, 0xd8
- vmovdqu [edx], ymm0
- vmovdqu [edx + edi], ymm2
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-#endif // HAS_SPLITUVROW_AVX2
-
-#ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
-
- convertloop:
- movdqu xmm0, [eax] // read 16 U's
- movdqu xmm1, [eax + edx] // and 16 V's
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- punpcklbw xmm0, xmm1 // first 8 UV pairs
- punpckhbw xmm2, xmm1 // next 8 UV pairs
- movdqu [edi], xmm0
- movdqu [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-#endif // HAS_MERGEUVROW_SSE2
-
-#ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_uv,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
-
- convertloop:
- vpmovzxbw ymm0, [eax]
- vpmovzxbw ymm1, [eax + edx]
- lea eax, [eax + 16]
- vpsllw ymm1, ymm1, 8
- vpor ymm2, ymm1, ymm0
- vmovdqu [edi], ymm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-#endif // HAS_MERGEUVROW_AVX2
-
-#ifdef HAS_COPYROW_SSE2
-// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked) void CopyRow_SSE2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- test eax, 15
- jne convertloopu
- test edx, 15
- jne convertloopu
-
- convertloopa:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloopa
- ret
-
- convertloopu:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloopu
- ret
- }
-}
-#endif // HAS_COPYROW_SSE2
-
-#ifdef HAS_COPYROW_AVX
-// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time.
-__declspec(naked) void CopyRow_AVX(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
-
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 64
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_COPYROW_AVX
-
-// Multiple of 1.
-__declspec(naked) void CopyRow_ERMS(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, esi
- mov edx, edi
- mov esi, [esp + 4] // src
- mov edi, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- rep movsb
- mov edi, edx
- mov esi, eax
- ret
- }
-}
-
-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
-// width in pixels
-__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- pcmpeqb xmm0, xmm0 // generate mask 0xff000000
- pslld xmm0, 24
- pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
- psrld xmm1, 8
-
- convertloop:
- movdqu xmm2, [eax]
- movdqu xmm3, [eax + 16]
- lea eax, [eax + 32]
- movdqu xmm4, [edx]
- movdqu xmm5, [edx + 16]
- pand xmm2, xmm0
- pand xmm3, xmm0
- pand xmm4, xmm1
- pand xmm5, xmm1
- por xmm2, xmm4
- por xmm3, xmm5
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm3
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBCOPYALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-// width in pixels
-__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm0, ymm0, ymm0
- vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
-
- convertloop:
- vmovdqu ymm1, [eax]
- vmovdqu ymm2, [eax + 32]
- lea eax, [eax + 64]
- vpblendvb ymm1, ymm1, [edx], ymm0
- vpblendvb ymm2, ymm2, [edx + 32], ymm0
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBCOPYALPHAROW_AVX2
-
-#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
-// width in pixels
-__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_a
- mov ecx, [esp + 12] // width
-
- extractloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrld xmm0, 24
- psrld xmm1, 24
- packssdw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 8
- jg extractloop
-
- ret
- }
-}
-#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
-
-#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
-// width in pixels
-__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_a
- mov ecx, [esp + 12] // width
- vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX
-
- extractloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vpsrld ymm0, ymm0, 24
- vpsrld ymm1, ymm1, 24
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- lea eax, [eax + 128]
- vpackssdw ymm0, ymm0, ymm1 // mutates
- vpsrld ymm2, ymm2, 24
- vpsrld ymm3, ymm3, 24
- vpackssdw ymm2, ymm2, ymm3 // mutates
- vpackuswb ymm0, ymm0, ymm2 // mutates
- vpermd ymm0, ymm4, ymm0 // unmutate
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg extractloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-// width in pixels
-__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- pcmpeqb xmm0, xmm0 // generate mask 0xff000000
- pslld xmm0, 24
- pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
- psrld xmm1, 8
-
- convertloop:
- movq xmm2, qword ptr [eax] // 8 Y's
- lea eax, [eax + 8]
- punpcklbw xmm2, xmm2
- punpckhwd xmm3, xmm2
- punpcklwd xmm2, xmm2
- movdqu xmm4, [edx]
- movdqu xmm5, [edx + 16]
- pand xmm2, xmm0
- pand xmm3, xmm0
- pand xmm4, xmm1
- pand xmm5, xmm1
- por xmm2, xmm4
- por xmm3, xmm5
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm3
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
-
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-// width in pixels
-__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
- uint8_t* dst,
- int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm0, ymm0, ymm0
- vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
-
- convertloop:
- vpmovzxbd ymm1, qword ptr [eax]
- vpmovzxbd ymm2, qword ptr [eax + 8]
- lea eax, [eax + 16]
- vpslld ymm1, ymm1, 24
- vpslld ymm2, ymm2, 24
- vpblendvb ymm1, ymm1, [edx], ymm0
- vpblendvb ymm2, ymm2, [edx + 32], ymm0
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
-
-#ifdef HAS_SETROW_X86
-// Write 'width' bytes using an 8 bit value repeated.
-// width should be multiple of 4.
-__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
- __asm {
- movzx eax, byte ptr [esp + 8] // v8
- mov edx, 0x01010101 // Duplicate byte to all bytes.
- mul edx // overwrites edx with upper part of result.
- mov edx, edi
- mov edi, [esp + 4] // dst
- mov ecx, [esp + 12] // width
- shr ecx, 2
- rep stosd
- mov edi, edx
- ret
- }
-}
-
-// Write 'width' bytes using an 8 bit value repeated.
-__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
- __asm {
- mov edx, edi
- mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v8
- mov ecx, [esp + 12] // width
- rep stosb
- mov edi, edx
- ret
- }
-}
-
-// Write 'width' 32 bit values.
-__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
- uint32_t v32,
- int width) {
- __asm {
- mov edx, edi
- mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v32
- mov ecx, [esp + 12] // width
- rep stosd
- mov edi, edx
- ret
- }
-}
-#endif // HAS_SETROW_X86
-
-#ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
-
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // even bytes are Y
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
-}
-
-__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
-
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-
-__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
-
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-
-__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
-
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // odd bytes are Y
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloop
- vzeroupper
- ret
- }
-}
-
-__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
-
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-
-__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
- vpsrlw ymm5, ymm5, 8
- sub edi, edx
-
- convertloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1 // mutates.
- vpermq ymm0, ymm0, 0xd8
- vpand ymm1, ymm0, ymm5 // U
- vpsrlw ymm0, ymm0, 8 // V
- vpackuswb ymm1, ymm1, ymm1 // mutates.
- vpackuswb ymm0, ymm0, ymm0 // mutates.
- vpermq ymm1, ymm1, 0xd8
- vpermq ymm0, ymm0, 0xd8
- vextractf128 [edx], ymm1, 0 // U
- vextractf128 [edx + edi], ymm0, 0 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-#endif // HAS_YUY2TOYROW_AVX2
-
-#ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // even bytes are Y
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
- int stride_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
- uint8_t* dst_y,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
- int stride_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-#endif // HAS_YUY2TOYROW_SSE2
-
-#ifdef HAS_BLENDPLANEROW_SSSE3
-// Blend 8 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- __asm {
- push esi
- push edi
- pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
- psllw xmm5, 8
- mov eax, 0x80808080 // 128 for biasing image to signed.
- movd xmm6, eax
- pshufd xmm6, xmm6, 0x00
-
- mov eax, 0x807f807f // 32768 + 127 for unbias and round.
- movd xmm7, eax
- pshufd xmm7, xmm7, 0x00
- mov eax, [esp + 8 + 4] // src0
- mov edx, [esp + 8 + 8] // src1
- mov esi, [esp + 8 + 12] // alpha
- mov edi, [esp + 8 + 16] // dst
- mov ecx, [esp + 8 + 20] // width
- sub eax, esi
- sub edx, esi
- sub edi, esi
-
- // 8 pixel loop.
- convertloop8:
- movq xmm0, qword ptr [esi] // alpha
- punpcklbw xmm0, xmm0
- pxor xmm0, xmm5 // a, 255-a
- movq xmm1, qword ptr [eax + esi] // src0
- movq xmm2, qword ptr [edx + esi] // src1
- punpcklbw xmm1, xmm2
- psubb xmm1, xmm6 // bias src0/1 - 128
- pmaddubsw xmm0, xmm1
- paddw xmm0, xmm7 // unbias result - 32768 and round.
- psrlw xmm0, 8
- packuswb xmm0, xmm0
- movq qword ptr [edi + esi], xmm0
- lea esi, [esi + 8]
- sub ecx, 8
- jg convertloop8
-
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_BLENDPLANEROW_SSSE3
-
-#ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 32 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
- const uint8_t* src1,
- const uint8_t* alpha,
- uint8_t* dst,
- int width) {
- __asm {
- push esi
- push edi
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
- vpsllw ymm5, ymm5, 8
- mov eax, 0x80808080 // 128 for biasing image to signed.
- vmovd xmm6, eax
- vbroadcastss ymm6, xmm6
- mov eax, 0x807f807f // 32768 + 127 for unbias and round.
- vmovd xmm7, eax
- vbroadcastss ymm7, xmm7
- mov eax, [esp + 8 + 4] // src0
- mov edx, [esp + 8 + 8] // src1
- mov esi, [esp + 8 + 12] // alpha
- mov edi, [esp + 8 + 16] // dst
- mov ecx, [esp + 8 + 20] // width
- sub eax, esi
- sub edx, esi
- sub edi, esi
-
- // 32 pixel loop.
- convertloop32:
- vmovdqu ymm0, [esi] // alpha
- vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
- vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
- vpxor ymm3, ymm3, ymm5 // a, 255-a
- vpxor ymm0, ymm0, ymm5 // a, 255-a
- vmovdqu ymm1, [eax + esi] // src0
- vmovdqu ymm2, [edx + esi] // src1
- vpunpckhbw ymm4, ymm1, ymm2
- vpunpcklbw ymm1, ymm1, ymm2
- vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
- vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
- vpmaddubsw ymm3, ymm3, ymm4
- vpmaddubsw ymm0, ymm0, ymm1
- vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
- vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
- vpsrlw ymm3, ymm3, 8
- vpsrlw ymm0, ymm0, 8
- vpackuswb ymm0, ymm0, ymm3
- vmovdqu [edi + esi], ymm0
- lea esi, [esi + 32]
- sub ecx, 32
- jg convertloop32
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_BLENDPLANEROW_AVX2
-
-#ifdef HAS_ARGBBLENDROW_SSSE3
-// Shuffle table for isolating alpha.
-static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
-
-// Blend 8 pixels at a time.
-__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm7, xmm7 // generate constant 0x0001
- psrlw xmm7, 15
- pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
- psrlw xmm6, 8
- pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
- psllw xmm5, 8
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
- sub ecx, 4
- jl convertloop4b // less than 4 pixels?
-
- // 4 pixel loop.
- convertloop4:
- movdqu xmm3, [eax] // src argb
- lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqu xmm2, [esi] // _r_b
- pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqu xmm1, [esi] // _a_g
- lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jge convertloop4
-
- convertloop4b:
- add ecx, 4 - 1
- jl convertloop1b
-
- // 1 pixel loop.
- convertloop1:
- movd xmm3, [eax] // src argb
- lea eax, [eax + 4]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movd xmm2, [esi] // _r_b
- pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movd xmm1, [esi] // _a_g
- lea esi, [esi + 4]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge convertloop1
-
- convertloop1b:
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBBLENDROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha0 = {
- 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
-};
-static const uvec8 kShuffleAlpha1 = {
- 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
-};
-__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- pcmpeqb xmm3, xmm3 // generate mask 0xff000000
- pslld xmm3, 24
- movdqa xmm4, xmmword ptr kShuffleAlpha0
- movdqa xmm5, xmmword ptr kShuffleAlpha1
-
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- pshufb xmm0, xmm4 // isolate first 2 alphas
- movdqu xmm1, [eax] // read 4 pixels
- punpcklbw xmm1, xmm1 // first 2 pixel rgbs
- pmulhuw xmm0, xmm1 // rgb * a
- movdqu xmm1, [eax] // read 4 pixels
- pshufb xmm1, xmm5 // isolate next 2 alphas
- movdqu xmm2, [eax] // read 4 pixels
- punpckhbw xmm2, xmm2 // next 2 pixel rgbs
- pmulhuw xmm1, xmm2 // rgb * a
- movdqu xmm2, [eax] // mask original alpha
- lea eax, [eax + 16]
- pand xmm2, xmm3
- psrlw xmm0, 8
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- por xmm0, xmm2 // copy original alpha
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBATTENUATEROW_SSSE3
-
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-// Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
- 128u, 128u, 14u, 15u, 14u, 15u,
- 14u, 15u, 128u, 128u};
-__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
- vpslld ymm5, ymm5, 24
-
- convertloop:
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpshufb ymm2, ymm0, ymm4 // low 4 alphas
- vpshufb ymm3, ymm1, ymm4 // high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * a
- vpmulhuw ymm1, ymm1, ymm3 // rgb * a
- vpand ymm6, ymm6, ymm5 // isolate alpha
- vpsrlw ymm0, ymm0, 8
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- vpor ymm0, ymm0, ymm6 // copy original alpha
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBATTENUATEROW_AVX2
-
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-// Unattenuate 4 pixels at a time.
-__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov edx, [esp + 12 + 8] // dst_argb
- mov ecx, [esp + 12 + 12] // width
- lea ebx, fixed_invtbl8
-
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- movzx esi, byte ptr [eax + 3] // first alpha
- movzx edi, byte ptr [eax + 7] // second alpha
- punpcklbw xmm0, xmm0 // first 2
- movd xmm2, dword ptr [ebx + esi * 4]
- movd xmm3, dword ptr [ebx + edi * 4]
- pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
- pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
- movlhps xmm2, xmm3
- pmulhuw xmm0, xmm2 // rgb * a
-
- movdqu xmm1, [eax] // read 4 pixels
- movzx esi, byte ptr [eax + 11] // third alpha
- movzx edi, byte ptr [eax + 15] // forth alpha
- punpckhbw xmm1, xmm1 // next 2
- movd xmm2, dword ptr [ebx + esi * 4]
- movd xmm3, dword ptr [ebx + edi * 4]
- pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
- pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
- movlhps xmm2, xmm3
- pmulhuw xmm1, xmm2 // rgb * a
- lea eax, [eax + 16]
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-#endif // HAS_ARGBUNATTENUATEROW_SSE2
-
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-// Shuffle table duplicating alpha.
-static const uvec8 kUnattenShuffleAlpha_AVX2 = {
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
-// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
-// USE_GATHER is not on by default, due to being a slow instruction.
-#ifdef USE_GATHER
-__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- sub edx, eax
- vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
-
- convertloop:
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
- vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
- vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
- vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
- vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
- vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
- vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
-
- vzeroupper
- ret
- }
-}
-#else // USE_GATHER
-__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
-
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov edx, [esp + 12 + 8] // dst_argb
- mov ecx, [esp + 12 + 12] // width
- sub edx, eax
- lea ebx, fixed_invtbl8
- vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
-
- convertloop:
- // replace VPGATHER
- movzx esi, byte ptr [eax + 3] // alpha0
- movzx edi, byte ptr [eax + 7] // alpha1
- vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
- vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
- movzx esi, byte ptr [eax + 11] // alpha2
- movzx edi, byte ptr [eax + 15] // alpha3
- vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
- vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
- vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
- movzx esi, byte ptr [eax + 19] // alpha4
- movzx edi, byte ptr [eax + 23] // alpha5
- vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
- vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
- vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
- movzx esi, byte ptr [eax + 27] // alpha6
- movzx edi, byte ptr [eax + 31] // alpha7
- vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
- vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
- vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
- vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
- vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
- vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
- vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
- // end of VPGATHER
-
- vmovdqu ymm6, [eax] // read 8 pixels.
- vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
- vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
- vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
- vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
- vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
- vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
- vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
- vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
- vpackuswb ymm0, ymm0, ymm1 // unmutated.
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- pop ebx
- vzeroupper
- ret
- }
-}
-#endif // USE_GATHER
-#endif // HAS_ARGBATTENUATEROW_AVX2
-
-#ifdef HAS_ARGBGRAYROW_SSSE3
-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToYJ
- movdqa xmm5, xmmword ptr kAddYJ64
-
- convertloop:
- movdqu xmm0, [eax] // G
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- phaddw xmm0, xmm1
- paddw xmm0, xmm5 // Add .5 for rounding.
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 G bytes
- movdqu xmm2, [eax] // A
- movdqu xmm3, [eax + 16]
- lea eax, [eax + 32]
- psrld xmm2, 24
- psrld xmm3, 24
- packuswb xmm2, xmm3
- packuswb xmm2, xmm2 // 8 A bytes
- movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
- punpcklbw xmm0, xmm0 // 8 GG words
- punpcklbw xmm3, xmm2 // 8 GA words
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm3 // GGGA first 4
- punpckhwd xmm1, xmm3 // GGGA next 4
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-#endif // HAS_ARGBGRAYROW_SSSE3
-
-#ifdef HAS_ARGBSEPIAROW_SSSE3
-// b = (r * 35 + g * 68 + b * 17) >> 7
-// g = (r * 45 + g * 88 + b * 22) >> 7
-// r = (r * 50 + g * 98 + b * 24) >> 7
-// Constant for ARGB color to sepia tone.
-static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
- 17, 68, 35, 0, 17, 68, 35, 0};
-
-static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
- 22, 88, 45, 0, 22, 88, 45, 0};
-
-static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
- 24, 98, 50, 0, 24, 98, 50, 0};
-
-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
- __asm {
- mov eax, [esp + 4] /* dst_argb */
- mov ecx, [esp + 8] /* width */
- movdqa xmm2, xmmword ptr kARGBToSepiaB
- movdqa xmm3, xmmword ptr kARGBToSepiaG
- movdqa xmm4, xmmword ptr kARGBToSepiaR
-
- convertloop:
- movdqu xmm0, [eax] // B
- movdqu xmm6, [eax + 16]
- pmaddubsw xmm0, xmm2
- pmaddubsw xmm6, xmm2
- phaddw xmm0, xmm6
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // 8 B values
- movdqu xmm5, [eax] // G
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm5, xmm3
- pmaddubsw xmm1, xmm3
- phaddw xmm5, xmm1
- psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 G values
- punpcklbw xmm0, xmm5 // 8 BG values
- movdqu xmm5, [eax] // R
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm5, xmm4
- pmaddubsw xmm1, xmm4
- phaddw xmm5, xmm1
- psrlw xmm5, 7
- packuswb xmm5, xmm5 // 8 R values
- movdqu xmm6, [eax] // A
- movdqu xmm1, [eax + 16]
- psrld xmm6, 24
- psrld xmm1, 24
- packuswb xmm6, xmm1
- packuswb xmm6, xmm6 // 8 A values
- punpcklbw xmm5, xmm6 // 8 RA values
- movdqa xmm1, xmm0 // Weave BG, RA together
- punpcklwd xmm0, xmm5 // BGRA first 4
- punpckhwd xmm1, xmm5 // BGRA next 4
- movdqu [eax], xmm0
- movdqu [eax + 16], xmm1
- lea eax, [eax + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-#endif // HAS_ARGBSEPIAROW_SSSE3
-
-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided.
-// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
-// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const int8_t* matrix_argb,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* matrix_argb */
- movdqu xmm5, [ecx]
- pshufd xmm2, xmm5, 0x00
- pshufd xmm3, xmm5, 0x55
- pshufd xmm4, xmm5, 0xaa
- pshufd xmm5, xmm5, 0xff
- mov ecx, [esp + 16] /* width */
-
- convertloop:
- movdqu xmm0, [eax] // B
- movdqu xmm7, [eax + 16]
- pmaddubsw xmm0, xmm2
- pmaddubsw xmm7, xmm2
- movdqu xmm6, [eax] // G
- movdqu xmm1, [eax + 16]
- pmaddubsw xmm6, xmm3
- pmaddubsw xmm1, xmm3
- phaddsw xmm0, xmm7 // B
- phaddsw xmm6, xmm1 // G
- psraw xmm0, 6 // B
- psraw xmm6, 6 // G
- packuswb xmm0, xmm0 // 8 B values
- packuswb xmm6, xmm6 // 8 G values
- punpcklbw xmm0, xmm6 // 8 BG values
- movdqu xmm1, [eax] // R
- movdqu xmm7, [eax + 16]
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm7, xmm4
- phaddsw xmm1, xmm7 // R
- movdqu xmm6, [eax] // A
- movdqu xmm7, [eax + 16]
- pmaddubsw xmm6, xmm5
- pmaddubsw xmm7, xmm5
- phaddsw xmm6, xmm7 // A
- psraw xmm1, 6 // R
- psraw xmm6, 6 // A
- packuswb xmm1, xmm1 // 8 R values
- packuswb xmm6, xmm6 // 8 A values
- punpcklbw xmm1, xmm6 // 8 RA values
- movdqa xmm6, xmm0 // Weave BG, RA together
- punpcklwd xmm0, xmm1 // BGRA first 4
- punpckhwd xmm6, xmm1 // BGRA next 4
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm6
- lea eax, [eax + 32]
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
-
-#ifdef HAS_ARGBQUANTIZEROW_SSE2
-// Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int width) {
- __asm {
- mov eax, [esp + 4] /* dst_argb */
- movd xmm2, [esp + 8] /* scale */
- movd xmm3, [esp + 12] /* interval_size */
- movd xmm4, [esp + 16] /* interval_offset */
- mov ecx, [esp + 20] /* width */
- pshuflw xmm2, xmm2, 040h
- pshufd xmm2, xmm2, 044h
- pshuflw xmm3, xmm3, 040h
- pshufd xmm3, xmm3, 044h
- pshuflw xmm4, xmm4, 040h
- pshufd xmm4, xmm4, 044h
- pxor xmm5, xmm5 // constant 0
- pcmpeqb xmm6, xmm6 // generate mask 0xff000000
- pslld xmm6, 24
-
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- punpcklbw xmm0, xmm5 // first 2 pixels
- pmulhuw xmm0, xmm2 // pixel * scale >> 16
- movdqu xmm1, [eax] // read 4 pixels
- punpckhbw xmm1, xmm5 // next 2 pixels
- pmulhuw xmm1, xmm2
- pmullw xmm0, xmm3 // * interval_size
- movdqu xmm7, [eax] // read 4 pixels
- pmullw xmm1, xmm3
- pand xmm7, xmm6 // mask alpha
- paddw xmm0, xmm4 // + interval_size / 2
- paddw xmm1, xmm4
- packuswb xmm0, xmm1
- por xmm0, xmm7
- movdqu [eax], xmm0
- lea eax, [eax + 16]
- sub ecx, 4
- jg convertloop
- ret
- }
-}
-#endif // HAS_ARGBQUANTIZEROW_SSE2
-
-#ifdef HAS_ARGBSHADEROW_SSE2
-// Shade 4 pixels at a time by specified value.
-__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- uint32_t value) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
- movd xmm2, [esp + 16] // value
- punpcklbw xmm2, xmm2
- punpcklqdq xmm2, xmm2
-
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- pmulhuw xmm0, xmm2 // argb * value
- pmulhuw xmm1, xmm2 // argb * value
- psrlw xmm0, 8
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBSHADEROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- pxor xmm5, xmm5 // constant 0
-
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb
- movdqu xmm2, [esi] // read 4 pixels from src_argb1
- movdqu xmm1, xmm0
- movdqu xmm3, xmm2
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- punpcklbw xmm2, xmm5 // first 2
- punpckhbw xmm3, xmm5 // next 2
- pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
- pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
- lea eax, [eax + 16]
- lea esi, [esi + 16]
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBMULTIPLYROW_SSE2
-
-#ifdef HAS_ARGBADDROW_SSE2
-// Add 2 rows of ARGB pixels together, 4 pixels at a time.
-// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
-
- sub ecx, 4
- jl convertloop49
-
- convertloop4:
- movdqu xmm0, [eax] // read 4 pixels from src_argb
- lea eax, [eax + 16]
- movdqu xmm1, [esi] // read 4 pixels from src_argb1
- lea esi, [esi + 16]
- paddusb xmm0, xmm1 // src_argb + src_argb1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jge convertloop4
-
- convertloop49:
- add ecx, 4 - 1
- jl convertloop19
-
- convertloop1:
- movd xmm0, [eax] // read 1 pixels from src_argb
- lea eax, [eax + 4]
- movd xmm1, [esi] // read 1 pixels from src_argb1
- lea esi, [esi + 4]
- paddusb xmm0, xmm1 // src_argb + src_argb1
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge convertloop1
-
- convertloop19:
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBADDROW_SSE2
-
-#ifdef HAS_ARGBSUBTRACTROW_SSE2
-// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
-
- convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb
- lea eax, [eax + 16]
- movdqu xmm1, [esi] // read 4 pixels from src_argb1
- lea esi, [esi + 16]
- psubusb xmm0, xmm1 // src_argb - src_argb1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBSUBTRACTROW_SSE2
-
-#ifdef HAS_ARGBMULTIPLYROW_AVX2
-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- vpxor ymm5, ymm5, ymm5 // constant 0
-
- convertloop:
- vmovdqu ymm1, [eax] // read 8 pixels from src_argb
- lea eax, [eax + 32]
- vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
- lea esi, [esi + 32]
- vpunpcklbw ymm0, ymm1, ymm1 // low 4
- vpunpckhbw ymm1, ymm1, ymm1 // high 4
- vpunpcklbw ymm2, ymm3, ymm5 // low 4
- vpunpckhbw ymm3, ymm3, ymm5 // high 4
- vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4
- vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4
- vpackuswb ymm0, ymm0, ymm1
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBMULTIPLYROW_AVX2
-
-#ifdef HAS_ARGBADDROW_AVX2
-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
-
- convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb
- lea eax, [eax + 32]
- vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
- lea esi, [esi + 32]
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBADDROW_AVX2
-
-#ifdef HAS_ARGBSUBTRACTROW_AVX2
-// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_argb1
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
-
- convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb
- lea eax, [eax + 32]
- vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1
- lea esi, [esi + 32]
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBSUBTRACTROW_AVX2
-
-#ifdef HAS_SOBELXROW_SSE2
-// SobelX as a matrix is
-// -1 0 1
-// -2 0 2
-// -1 0 1
-__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
- const uint8_t* src_y1,
- const uint8_t* src_y2,
- uint8_t* dst_sobelx,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y0
- mov esi, [esp + 8 + 8] // src_y1
- mov edi, [esp + 8 + 12] // src_y2
- mov edx, [esp + 8 + 16] // dst_sobelx
- mov ecx, [esp + 8 + 20] // width
- sub esi, eax
- sub edi, eax
- sub edx, eax
- pxor xmm5, xmm5 // constant 0
-
- convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
- movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
- movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
- movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
- psubw xmm2, xmm3
- paddw xmm0, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm1
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
- psubw xmm1, xmm0
- pmaxsw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [eax + edx], xmm0
- lea eax, [eax + 8]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_SOBELXROW_SSE2
-
-#ifdef HAS_SOBELYROW_SSE2
-// SobelY as a matrix is
-// -1 -2 -1
-// 0 0 0
-// 1 2 1
-__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
- const uint8_t* src_y1,
- uint8_t* dst_sobely,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_y0
- mov esi, [esp + 4 + 8] // src_y1
- mov edx, [esp + 4 + 12] // dst_sobely
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- sub edx, eax
- pxor xmm5, xmm5 // constant 0
-
- convertloop:
- movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
- movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- psubw xmm0, xmm1
- movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
- movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- psubw xmm1, xmm2
- movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
- movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
- psubw xmm2, xmm3
- paddw xmm0, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm1
- pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
- psubw xmm1, xmm0
- pmaxsw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [eax + edx], xmm0
- lea eax, [eax + 8]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_SOBELYROW_SSE2
-
-#ifdef HAS_SOBELROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
-// A = 255
-// R = Sobel
-// G = Sobel
-// B = Sobel
-__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
- pslld xmm5, 24 // 0xff000000
-
- convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- paddusb xmm0, xmm1 // sobel = sobelx + sobely
- movdqa xmm2, xmm0 // GG
- punpcklbw xmm2, xmm0 // First 8
- punpckhbw xmm0, xmm0 // Next 8
- movdqa xmm1, xmm2 // GGGG
- punpcklwd xmm1, xmm2 // First 4
- punpckhwd xmm2, xmm2 // Next 4
- por xmm1, xmm5 // GGGA
- por xmm2, xmm5
- movdqa xmm3, xmm0 // GGGG
- punpcklwd xmm3, xmm0 // Next 4
- punpckhwd xmm0, xmm0 // Last 4
- por xmm3, xmm5 // GGGA
- por xmm0, xmm5
- movdqu [edx], xmm1
- movdqu [edx + 16], xmm2
- movdqu [edx + 32], xmm3
- movdqu [edx + 48], xmm0
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_SOBELROW_SSE2
-
-#ifdef HAS_SOBELTOPLANEROW_SSE2
-// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_y,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
-
- convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- paddusb xmm0, xmm1 // sobel = sobelx + sobely
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_SOBELTOPLANEROW_SSE2
-
-#ifdef HAS_SOBELXYROW_SSE2
-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
-// A = 255
-// R = Sobel X
-// G = Sobel
-// B = Sobel Y
-__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
- const uint8_t* src_sobely,
- uint8_t* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_sobelx
- mov esi, [esp + 4 + 8] // src_sobely
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
-
- convertloop:
- movdqu xmm0, [eax] // read 16 pixels src_sobelx
- movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- paddusb xmm2, xmm1 // sobel = sobelx + sobely
- movdqa xmm3, xmm0 // XA
- punpcklbw xmm3, xmm5
- punpckhbw xmm0, xmm5
- movdqa xmm4, xmm1 // YS
- punpcklbw xmm4, xmm2
- punpckhbw xmm1, xmm2
- movdqa xmm6, xmm4 // YSXA
- punpcklwd xmm6, xmm3 // First 4
- punpckhwd xmm4, xmm3 // Next 4
- movdqa xmm7, xmm1 // YSXA
- punpcklwd xmm7, xmm0 // Next 4
- punpckhwd xmm1, xmm0 // Last 4
- movdqu [edx], xmm6
- movdqu [edx + 16], xmm4
- movdqu [edx + 32], xmm7
- movdqu [edx + 48], xmm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_SOBELXYROW_SSE2
-
-#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-// Consider float CumulativeSum.
-// Consider calling CumulativeSum one row at time as needed.
-// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
-// Convert cumulative sum for an area to an average for 1 pixel.
-// topleft is pointer to top left of CumulativeSum buffer for area.
-// botleft is pointer to bottom left of CumulativeSum buffer.
-// width is offset from left to right of area in CumulativeSum buffer measured
-// in number of ints.
-// area is the number of pixels in the area being averaged.
-// dst points to pixel to store result to.
-// count is number of averaged pixels to produce.
-// Does 4 pixels at a time.
-// This function requires alignment on accumulation buffer pointers.
-void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
- const int32_t* botleft,
- int width,
- int area,
- uint8_t* dst,
- int count) {
- __asm {
- mov eax, topleft // eax topleft
- mov esi, botleft // esi botleft
- mov edx, width
- movd xmm5, area
- mov edi, dst
- mov ecx, count
- cvtdq2ps xmm5, xmm5
- rcpss xmm4, xmm5 // 1.0f / area
- pshufd xmm4, xmm4, 0
- sub ecx, 4
- jl l4b
-
- cmp area, 128 // 128 pixels will not overflow 15 bits.
- ja l4
-
- pshufd xmm5, xmm5, 0 // area
- pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
- psrld xmm6, 16
- cvtdq2ps xmm6, xmm6
- addps xmm5, xmm6 // (65536.0 + area - 1)
- mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
- cvtps2dq xmm5, xmm5 // 0.16 fixed point
- packssdw xmm5, xmm5 // 16 bit shorts
-
- // 4 pixel loop small blocks.
- s4:
- // top left
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
-
- // - top right
- psubd xmm0, [eax + edx * 4]
- psubd xmm1, [eax + edx * 4 + 16]
- psubd xmm2, [eax + edx * 4 + 32]
- psubd xmm3, [eax + edx * 4 + 48]
- lea eax, [eax + 64]
-
- // - bottom left
- psubd xmm0, [esi]
- psubd xmm1, [esi + 16]
- psubd xmm2, [esi + 32]
- psubd xmm3, [esi + 48]
-
- // + bottom right
- paddd xmm0, [esi + edx * 4]
- paddd xmm1, [esi + edx * 4 + 16]
- paddd xmm2, [esi + edx * 4 + 32]
- paddd xmm3, [esi + edx * 4 + 48]
- lea esi, [esi + 64]
-
- packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
- packssdw xmm2, xmm3
-
- pmulhuw xmm0, xmm5
- pmulhuw xmm2, xmm5
-
- packuswb xmm0, xmm2
- movdqu [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 4
- jge s4
-
- jmp l4b
-
- // 4 pixel loop
- l4:
- // top left
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
-
- // - top right
- psubd xmm0, [eax + edx * 4]
- psubd xmm1, [eax + edx * 4 + 16]
- psubd xmm2, [eax + edx * 4 + 32]
- psubd xmm3, [eax + edx * 4 + 48]
- lea eax, [eax + 64]
-
- // - bottom left
- psubd xmm0, [esi]
- psubd xmm1, [esi + 16]
- psubd xmm2, [esi + 32]
- psubd xmm3, [esi + 48]
-
- // + bottom right
- paddd xmm0, [esi + edx * 4]
- paddd xmm1, [esi + edx * 4 + 16]
- paddd xmm2, [esi + edx * 4 + 32]
- paddd xmm3, [esi + edx * 4 + 48]
- lea esi, [esi + 64]
-
- cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
- cvtdq2ps xmm1, xmm1
- mulps xmm0, xmm4
- mulps xmm1, xmm4
- cvtdq2ps xmm2, xmm2
- cvtdq2ps xmm3, xmm3
- mulps xmm2, xmm4
- mulps xmm3, xmm4
- cvtps2dq xmm0, xmm0
- cvtps2dq xmm1, xmm1
- cvtps2dq xmm2, xmm2
- cvtps2dq xmm3, xmm3
- packssdw xmm0, xmm1
- packssdw xmm2, xmm3
- packuswb xmm0, xmm2
- movdqu [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 4
- jge l4
-
- l4b:
- add ecx, 4 - 1
- jl l1b
-
- // 1 pixel loop
- l1:
- movdqu xmm0, [eax]
- psubd xmm0, [eax + edx * 4]
- lea eax, [eax + 16]
- psubd xmm0, [esi]
- paddd xmm0, [esi + edx * 4]
- lea esi, [esi + 16]
- cvtdq2ps xmm0, xmm0
- mulps xmm0, xmm4
- cvtps2dq xmm0, xmm0
- packssdw xmm0, xmm0
- packuswb xmm0, xmm0
- movd dword ptr [edi], xmm0
- lea edi, [edi + 4]
- sub ecx, 1
- jge l1
- l1b:
- }
-}
-#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-
-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
-// Creates a table of cumulative sums where each value is a sum of all values
-// above and to the left of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
- int32_t* cumsum,
- const int32_t* previous_cumsum,
- int width) {
- __asm {
- mov eax, row
- mov edx, cumsum
- mov esi, previous_cumsum
- mov ecx, width
- pxor xmm0, xmm0
- pxor xmm1, xmm1
-
- sub ecx, 4
- jl l4b
- test edx, 15
- jne l4b
-
- // 4 pixel loop
- l4:
- movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
- lea eax, [eax + 16]
- movdqa xmm4, xmm2
-
- punpcklbw xmm2, xmm1
- movdqa xmm3, xmm2
- punpcklwd xmm2, xmm1
- punpckhwd xmm3, xmm1
-
- punpckhbw xmm4, xmm1
- movdqa xmm5, xmm4
- punpcklwd xmm4, xmm1
- punpckhwd xmm5, xmm1
-
- paddd xmm0, xmm2
- movdqu xmm2, [esi] // previous row above.
- paddd xmm2, xmm0
-
- paddd xmm0, xmm3
- movdqu xmm3, [esi + 16]
- paddd xmm3, xmm0
-
- paddd xmm0, xmm4
- movdqu xmm4, [esi + 32]
- paddd xmm4, xmm0
-
- paddd xmm0, xmm5
- movdqu xmm5, [esi + 48]
- lea esi, [esi + 64]
- paddd xmm5, xmm0
-
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm3
- movdqu [edx + 32], xmm4
- movdqu [edx + 48], xmm5
-
- lea edx, [edx + 64]
- sub ecx, 4
- jge l4
-
- l4b:
- add ecx, 4 - 1
- jl l1b
-
- // 1 pixel loop
- l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel
- lea eax, [eax + 4]
- punpcklbw xmm2, xmm1
- punpcklwd xmm2, xmm1
- paddd xmm0, xmm2
- movdqu xmm2, [esi]
- lea esi, [esi + 16]
- paddd xmm2, xmm0
- movdqu [edx], xmm2
- lea edx, [edx + 16]
- sub ecx, 1
- jge l1
-
- l1b:
- }
-}
-#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
-
-#ifdef HAS_ARGBAFFINEROW_SSE2
-// Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
- int src_argb_stride,
- uint8_t* dst_argb,
- const float* uv_dudv,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 12] // src_argb
- mov esi, [esp + 16] // stride
- mov edx, [esp + 20] // dst_argb
- mov ecx, [esp + 24] // pointer to uv_dudv
- movq xmm2, qword ptr [ecx] // uv
- movq xmm7, qword ptr [ecx + 8] // dudv
- mov ecx, [esp + 28] // width
- shl esi, 16 // 4, stride
- add esi, 4
- movd xmm5, esi
- sub ecx, 4
- jl l4b
-
- // setup for 4 pixel loop
- pshufd xmm7, xmm7, 0x44 // dup dudv
- pshufd xmm5, xmm5, 0 // dup 4, stride
- movdqa xmm0, xmm2 // x0, y0, x1, y1
- addps xmm0, xmm7
- movlhps xmm2, xmm0
- movdqa xmm4, xmm7
- addps xmm4, xmm4 // dudv *= 2
- movdqa xmm3, xmm2 // x2, y2, x3, y3
- addps xmm3, xmm4
- addps xmm4, xmm4 // dudv *= 4
-
- // 4 pixel loop
- l4:
- cvttps2dq xmm0, xmm2 // x, y float to int first 2
- cvttps2dq xmm1, xmm3 // x, y float to int next 2
- packssdw xmm0, xmm1 // x, y as 8 shorts
- pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd edi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd xmm1, [eax + esi] // read pixel 0
- movd xmm6, [eax + edi] // read pixel 1
- punpckldq xmm1, xmm6 // combine pixel 0 and 1
- addps xmm2, xmm4 // x, y += dx, dy first 2
- movq qword ptr [edx], xmm1
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // shift right
- movd edi, xmm0
- movd xmm6, [eax + esi] // read pixel 2
- movd xmm0, [eax + edi] // read pixel 3
- punpckldq xmm6, xmm0 // combine pixel 2 and 3
- addps xmm3, xmm4 // x, y += dx, dy next 2
- movq qword ptr 8[edx], xmm6
- lea edx, [edx + 16]
- sub ecx, 4
- jge l4
-
- l4b:
- add ecx, 4 - 1
- jl l1b
-
- // 1 pixel loop
- l1:
- cvttps2dq xmm0, xmm2 // x, y float to int
- packssdw xmm0, xmm0 // x, y as shorts
- pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
- addps xmm2, xmm7 // x, y += dx, dy
- movd esi, xmm0
- movd xmm0, [eax + esi] // copy a pixel
- movd [edx], xmm0
- lea edx, [edx + 4]
- sub ecx, 1
- jge l1
- l1b:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBAFFINEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 32x2 -> 32x1
-__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 256. Blend 100 / 0.
- sub edi, esi
- cmp eax, 128
- je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
-
- vmovd xmm0, eax // high fraction 0..255
- neg eax
- add eax, 256
- vmovd xmm5, eax // low fraction 256..1
- vpunpcklbw xmm5, xmm5, xmm0
- vpunpcklwd xmm5, xmm5, xmm5
- vbroadcastss ymm5, xmm5
-
- mov eax, 0x80808080 // 128b for bias and rounding.
- vmovd xmm4, eax
- vbroadcastss ymm4, xmm4
-
- xloop:
- vmovdqu ymm0, [esi]
- vmovdqu ymm2, [esi + edx]
- vpunpckhbw ymm1, ymm0, ymm2 // mutates
- vpunpcklbw ymm0, ymm0, ymm2
- vpsubb ymm1, ymm1, ymm4 // bias to signed image
- vpsubb ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm5, ymm1
- vpmaddubsw ymm0, ymm5, ymm0
- vpaddw ymm1, ymm1, ymm4 // unbias and round
- vpaddw ymm0, ymm0, ymm4
- vpsrlw ymm1, ymm1, 8
- vpsrlw ymm0, ymm0, 8
- vpackuswb ymm0, ymm0, ymm1 // unmutates
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- sub ecx, 32
- jg xloop
- jmp xloop99
-
- // Blend 50 / 50.
- xloop50:
- vmovdqu ymm0, [esi]
- vpavgb ymm0, ymm0, [esi + edx]
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- sub ecx, 32
- jg xloop50
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- xloop100:
- rep movsb
-
- xloop99:
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_AVX2
-
-// Bilinear filter 16x2 -> 16x1
-// TODO(fbarchard): Consider allowing 256 using memcpy.
-__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
-
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 /256. Blend 100 / 0.
- cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
-
- movd xmm0, eax // high fraction 0..255
- neg eax
- add eax, 256
- movd xmm5, eax // low fraction 255..1
- punpcklbw xmm5, xmm0
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
- mov eax, 0x80808080 // 128 for biasing image to signed.
- movd xmm4, eax
- pshufd xmm4, xmm4, 0x00
-
- xloop:
- movdqu xmm0, [esi]
- movdqu xmm2, [esi + edx]
- movdqu xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- psubb xmm0, xmm4 // bias image by -128
- psubb xmm1, xmm4
- movdqa xmm2, xmm5
- movdqa xmm3, xmm5
- pmaddubsw xmm2, xmm0
- pmaddubsw xmm3, xmm1
- paddw xmm2, xmm4
- paddw xmm3, xmm4
- psrlw xmm2, 8
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqu [esi + edi], xmm2
- lea esi, [esi + 16]
- sub ecx, 16
- jg xloop
- jmp xloop99
-
- // Blend 50 / 50.
- xloop50:
- movdqu xmm0, [esi]
- movdqu xmm1, [esi + edx]
- pavgb xmm0, xmm1
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- sub ecx, 16
- jg xloop50
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- xloop100:
- movdqu xmm0, [esi]
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- sub ecx, 16
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- movdqu xmm5, [ecx]
- mov ecx, [esp + 16] // width
-
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg wloop
- ret
- }
-}
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const uint8_t* shuffler,
- int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
- mov ecx, [esp + 16] // width
-
- wloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpshufb ymm0, ymm0, ymm5
- vpshufb ymm1, ymm1, ymm5
- vmovdqu [edx], ymm0
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 16
- jg wloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBSHUFFLEROW_AVX2
-
-// YUY2 - Macro-pixel = 2 image pixels
-// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
-
-// UYVY - Macro-pixel = 2 image pixels
-// U0Y0V0Y1
-
-__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_frame,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
- sub edx, esi
-
- convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
- lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqu xmm0, [eax] // Y
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2 // YUYV
- punpckhbw xmm1, xmm2
- movdqu [edi], xmm0
- movdqu [edi + 16], xmm1
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* dst_frame,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_y
- mov esi, [esp + 8 + 8] // src_u
- mov edx, [esp + 8 + 12] // src_v
- mov edi, [esp + 8 + 16] // dst_frame
- mov ecx, [esp + 8 + 20] // width
- sub edx, esi
-
- convertloop:
- movq xmm2, qword ptr [esi] // U
- movq xmm3, qword ptr [esi + edx] // V
- lea esi, [esi + 8]
- punpcklbw xmm2, xmm3 // UV
- movdqu xmm0, [eax] // Y
- movdqa xmm1, xmm2
- lea eax, [eax + 16]
- punpcklbw xmm1, xmm0 // UYVY
- punpckhbw xmm2, xmm0
- movdqu [edi], xmm1
- movdqu [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const float* poly,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* src_argb */
- mov edx, [esp + 4 + 8] /* dst_argb */
- mov esi, [esp + 4 + 12] /* poly */
- mov ecx, [esp + 4 + 16] /* width */
- pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
-
- // 2 pixel loop.
- convertloop:
- // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
- // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
- movq xmm0, qword ptr [eax] // BGRABGRA
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm3
- movdqa xmm4, xmm0
- punpcklwd xmm0, xmm3 // pixel 0
- punpckhwd xmm4, xmm3 // pixel 1
- cvtdq2ps xmm0, xmm0 // 4 floats
- cvtdq2ps xmm4, xmm4
- movdqa xmm1, xmm0 // X
- movdqa xmm5, xmm4
- mulps xmm0, [esi + 16] // C1 * X
- mulps xmm4, [esi + 16]
- addps xmm0, [esi] // result = C0 + C1 * X
- addps xmm4, [esi]
- movdqa xmm2, xmm1
- movdqa xmm6, xmm5
- mulps xmm2, xmm1 // X * X
- mulps xmm6, xmm5
- mulps xmm1, xmm2 // X * X * X
- mulps xmm5, xmm6
- mulps xmm2, [esi + 32] // C2 * X * X
- mulps xmm6, [esi + 32]
- mulps xmm1, [esi + 48] // C3 * X * X * X
- mulps xmm5, [esi + 48]
- addps xmm0, xmm2 // result += C2 * X * X
- addps xmm4, xmm6
- addps xmm0, xmm1 // result += C3 * X * X * X
- addps xmm4, xmm5
- cvttps2dq xmm0, xmm0
- cvttps2dq xmm4, xmm4
- packuswb xmm0, xmm4
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 2
- jg convertloop
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBPOLYNOMIALROW_SSE2
-
-#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
- uint8_t* dst_argb,
- const float* poly,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_argb */
- mov ecx, [esp + 12] /* poly */
- vbroadcastf128 ymm4, [ecx] // C0
- vbroadcastf128 ymm5, [ecx + 16] // C1
- vbroadcastf128 ymm6, [ecx + 32] // C2
- vbroadcastf128 ymm7, [ecx + 48] // C3
- mov ecx, [esp + 16] /* width */
-
- // 2 pixel loop.
- convertloop:
- vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
- lea eax, [eax + 8]
- vcvtdq2ps ymm0, ymm0 // X 8 floats
- vmulps ymm2, ymm0, ymm0 // X * X
- vmulps ymm3, ymm0, ymm7 // C3 * X
- vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
- vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
- vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
- vcvttps2dq ymm0, ymm0
- vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
- vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
- vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
- vmovq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 2
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBPOLYNOMIALROW_AVX2
-
-#ifdef HAS_HALFFLOATROW_SSE2
-static float kExpBias = 1.9259299444e-34f;
-__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src */
- mov edx, [esp + 8] /* dst */
- movd xmm4, dword ptr [esp + 12] /* scale */
- mov ecx, [esp + 16] /* width */
- mulss xmm4, kExpBias
- pshufd xmm4, xmm4, 0
- pxor xmm5, xmm5
- sub edx, eax
-
- // 8 pixel loop.
- convertloop:
- movdqu xmm2, xmmword ptr [eax] // 8 shorts
- add eax, 16
- movdqa xmm3, xmm2
- punpcklwd xmm2, xmm5
- cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
- punpckhwd xmm3, xmm5
- cvtdq2ps xmm3, xmm3
- mulps xmm2, xmm4
- mulps xmm3, xmm4
- psrld xmm2, 13
- psrld xmm3, 13
- packssdw xmm2, xmm3
- movdqu [eax + edx - 16], xmm2
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-#endif // HAS_HALFFLOATROW_SSE2
-
-#ifdef HAS_HALFFLOATROW_AVX2
-__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src */
- mov edx, [esp + 8] /* dst */
- movd xmm4, dword ptr [esp + 12] /* scale */
- mov ecx, [esp + 16] /* width */
-
- vmulss xmm4, xmm4, kExpBias
- vbroadcastss ymm4, xmm4
- vpxor ymm5, ymm5, ymm5
- sub edx, eax
-
- // 16 pixel loop.
- convertloop:
- vmovdqu ymm2, [eax] // 16 shorts
- add eax, 32
- vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
- vpunpcklwd ymm2, ymm2, ymm5
- vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
- vcvtdq2ps ymm2, ymm2
- vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
- vmulps ymm2, ymm2, ymm4
- vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
- vpsrld ymm2, ymm2, 13
- vpackssdw ymm2, ymm2, ymm3
- vmovdqu [eax + edx - 32], ymm2
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_HALFFLOATROW_AVX2
-
-#ifdef HAS_HALFFLOATROW_F16C
-__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width) {
- __asm {
- mov eax, [esp + 4] /* src */
- mov edx, [esp + 8] /* dst */
- vbroadcastss ymm4, [esp + 12] /* scale */
- mov ecx, [esp + 16] /* width */
- sub edx, eax
-
- // 16 pixel loop.
- convertloop:
- vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
- vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
- add eax, 32
- vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
- vcvtdq2ps ymm3, ymm3
- vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
- vmulps ymm3, ymm3, ymm4
- vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
- vcvtps2ph xmm3, ymm3, 3
- vmovdqu [eax + edx + 32], xmm2
- vmovdqu [eax + edx + 32 + 16], xmm3
- sub ecx, 16
- jg convertloop
- vzeroupper
- ret
- }
-}
-#endif // HAS_HALFFLOATROW_F16C
-
-#ifdef HAS_ARGBCOLORTABLEROW_X86
-// Tranform ARGB pixels with color table.
-__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
- const uint8_t* table_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
- mov ecx, [esp + 4 + 12] /* width */
-
- // 1 pixel loop.
- convertloop:
- movzx edx, byte ptr [eax]
- lea eax, [eax + 4]
- movzx edx, byte ptr [esi + edx * 4]
- mov byte ptr [eax - 4], dl
- movzx edx, byte ptr [eax - 4 + 1]
- movzx edx, byte ptr [esi + edx * 4 + 1]
- mov byte ptr [eax - 4 + 1], dl
- movzx edx, byte ptr [eax - 4 + 2]
- movzx edx, byte ptr [esi + edx * 4 + 2]
- mov byte ptr [eax - 4 + 2], dl
- movzx edx, byte ptr [eax - 4 + 3]
- movzx edx, byte ptr [esi + edx * 4 + 3]
- mov byte ptr [eax - 4 + 3], dl
- dec ecx
- jg convertloop
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBCOLORTABLEROW_X86
-
-#ifdef HAS_RGBCOLORTABLEROW_X86
-// Tranform RGB pixels with color table.
-__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
- const uint8_t* table_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] /* dst_argb */
- mov esi, [esp + 4 + 8] /* table_argb */
- mov ecx, [esp + 4 + 12] /* width */
-
- // 1 pixel loop.
- convertloop:
- movzx edx, byte ptr [eax]
- lea eax, [eax + 4]
- movzx edx, byte ptr [esi + edx * 4]
- mov byte ptr [eax - 4], dl
- movzx edx, byte ptr [eax - 4 + 1]
- movzx edx, byte ptr [esi + edx * 4 + 1]
- mov byte ptr [eax - 4 + 1], dl
- movzx edx, byte ptr [eax - 4 + 2]
- movzx edx, byte ptr [esi + edx * 4 + 2]
- mov byte ptr [eax - 4 + 2], dl
- dec ecx
- jg convertloop
-
- pop esi
- ret
- }
-}
-#endif // HAS_RGBCOLORTABLEROW_X86
-
-#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
-// Tranform RGB pixels with luma table.
-__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
- uint8_t* dst_argb,
- int width,
- const uint8_t* luma,
- uint32_t lumacoeff) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] /* src_argb */
- mov edi, [esp + 8 + 8] /* dst_argb */
- mov ecx, [esp + 8 + 12] /* width */
- movd xmm2, dword ptr [esp + 8 + 16] // luma table
- movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
- pshufd xmm2, xmm2, 0
- pshufd xmm3, xmm3, 0
- pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
- psllw xmm4, 8
- pxor xmm5, xmm5
-
- // 4 pixel loop.
- convertloop:
- movdqu xmm0, xmmword ptr [eax] // generate luma ptr
- pmaddubsw xmm0, xmm3
- phaddw xmm0, xmm0
- pand xmm0, xmm4 // mask out low bits
- punpcklwd xmm0, xmm5
- paddd xmm0, xmm2 // add table base
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
-
- movzx edx, byte ptr [eax]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi], dl
- movzx edx, byte ptr [eax + 1]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 1], dl
- movzx edx, byte ptr [eax + 2]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 2], dl
- movzx edx, byte ptr [eax + 3] // copy alpha.
- mov byte ptr [edi + 3], dl
-
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
-
- movzx edx, byte ptr [eax + 4]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 4], dl
- movzx edx, byte ptr [eax + 5]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 5], dl
- movzx edx, byte ptr [eax + 6]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 6], dl
- movzx edx, byte ptr [eax + 7] // copy alpha.
- mov byte ptr [edi + 7], dl
-
- movd esi, xmm0
- pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
-
- movzx edx, byte ptr [eax + 8]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 8], dl
- movzx edx, byte ptr [eax + 9]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 9], dl
- movzx edx, byte ptr [eax + 10]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 10], dl
- movzx edx, byte ptr [eax + 11] // copy alpha.
- mov byte ptr [edi + 11], dl
-
- movd esi, xmm0
-
- movzx edx, byte ptr [eax + 12]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 12], dl
- movzx edx, byte ptr [eax + 13]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 13], dl
- movzx edx, byte ptr [eax + 14]
- movzx edx, byte ptr [esi + edx]
- mov byte ptr [edi + 14], dl
- movzx edx, byte ptr [eax + 15] // copy alpha.
- mov byte ptr [edi + 15], dl
-
- lea eax, [eax + 16]
- lea edi, [edi + 16]
- sub ecx, 4
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
-
-#endif // defined(_M_X64)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale.cc
index 591a6a938..80b030dc2 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/scale.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/scale.cc
@@ -1118,6 +1118,11 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
#if defined(HAS_SCALEFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -1313,6 +1318,11 @@ void ScalePlaneBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
if (filtering && src_width >= 32768) {
ScaleFilterCols = ScaleFilterCols64_C;
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_argb.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_argb.cc
index 8d2509474..ddd8d29ed 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/scale_argb.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/scale_argb.cc
@@ -348,6 +348,11 @@ static void ScaleARGBBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
@@ -467,6 +472,11 @@ static void ScaleARGBBilinearUp(int src_width,
InterpolateRow = InterpolateRow_LSX;
}
}
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
#endif
if (src_width >= 32768) {
ScaleARGBFilterCols =
@@ -724,6 +734,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_common.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_common.cc
index 5e603fd40..774559032 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/scale_common.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/scale_common.cc
@@ -1678,6 +1678,12 @@ void ScalePlaneVertical(int src_height,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
+
for (j = 0; j < dst_height; ++j) {
int yi;
int yf;
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_mmi.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_mmi.cc
deleted file mode 100644
index 1226ef3ea..000000000
--- a/libfenrir/src/main/jni/animation/libyuv/source/scale_mmi.cc
+++ /dev/null
@@ -1,1168 +0,0 @@
-/*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/scale.h"
-
-#include
-#include
-
-#include "libyuv/cpu_id.h"
-#include "libyuv/planar_functions.h" // For CopyARGB
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for Mips MMI.
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-// clang-format off
-
-// CPU agnostic row functions
-void ScaleRowDown2_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1, dest;
- const uint64_t shift = 0x8ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "psrlh %[src0], %[src0], %[shift] \n\t"
-
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "psrlh %[src1], %[src1], %[shift] \n\t"
-
- "packushb %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift] "f"(shift)
- : "memory");
-}
-
-void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest0, dest1;
-
- const uint64_t mask = 0x00ff00ff00ff00ffULL;
- const uint64_t shift = 0x8ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "and %[dest0], %[src0], %[mask] \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "and %[dest1], %[src1], %[mask] \n\t"
- "packushb %[dest0], %[dest0], %[dest1] \n\t"
-
- "psrlh %[src0], %[src0], %[shift] \n\t"
- "psrlh %[src1], %[src1], %[shift] \n\t"
- "packushb %[dest1], %[src0], %[src1] \n\t"
-
- "pavgb %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0),
- [dest1] "=&f"(dest1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask),
- [shift] "f"(shift), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- const uint8_t* s = src_ptr;
- const uint8_t* t = src_ptr + src_stride;
-
- uint64_t s0, s1, t0, t1;
- uint64_t dest, dest0, dest1;
-
- const uint64_t ph = 0x0002000200020002ULL;
- const uint64_t mask = 0x00ff00ff00ff00ffULL;
- const uint64_t shift0 = 0x2ULL;
- const uint64_t shift1 = 0x8ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[s0], 0x00(%[s]) \n\t"
- "gsldlc1 %[s0], 0x07(%[s]) \n\t"
- "psrlh %[s1], %[s0], %[shift1] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x00(%[t]) \n\t"
- "gsldlc1 %[t0], 0x07(%[t]) \n\t"
- "psrlh %[t1], %[t0], %[shift1] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddh %[dest0], %[s0], %[s1] \n\t"
- "paddh %[dest0], %[dest0], %[t0] \n\t"
- "paddh %[dest0], %[dest0], %[t1] \n\t"
- "paddh %[dest0], %[dest0], %[ph] \n\t"
- "psrlh %[dest0], %[dest0], %[shift0] \n\t"
-
- "gsldrc1 %[s0], 0x08(%[s]) \n\t"
- "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
- "psrlh %[s1], %[s0], %[shift1] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x08(%[t]) \n\t"
- "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
- "psrlh %[t1], %[t0], %[shift1] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddh %[dest1], %[s0], %[s1] \n\t"
- "paddh %[dest1], %[dest1], %[t0] \n\t"
- "paddh %[dest1], %[dest1], %[t1] \n\t"
- "paddh %[dest1], %[dest1], %[ph] \n\t"
- "psrlh %[dest1], %[dest1], %[shift0] \n\t"
-
- "packushb %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[s], %[s], 0x10 \n\t"
- "daddiu %[t], %[t], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest)
- : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
- [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
-
- const uint32_t* src = (const uint32_t*)(src_argb);
- uint32_t* dst = (uint32_t*)(dst_argb);
-
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "punpckhwd %[dest], %[src0], %[src1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- __asm__ volatile(
- "1: \n\t"
- "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "lwc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "punpcklwd %[dest_lo], %[src0], %[src1] \n\t"
- "lwc1 %[src0], 0x04(%[src_ptr]) \n\t"
- "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t"
- "punpcklwd %[dest_hi], %[src0], %[src1] \n\t"
-
- "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- const uint8_t* s = src_argb;
- const uint8_t* t = src_argb + src_stride;
-
- uint64_t s0, s_hi, s_lo;
- uint64_t t0, t_hi, t_lo;
- uint64_t dest, dest_hi, dest_lo;
-
- const uint64_t mask = 0x0ULL;
- const uint64_t ph = 0x0002000200020002ULL;
- const uint64_t shfit = 0x2ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[s0], 0x00(%[s]) \n\t"
- "gsldlc1 %[s0], 0x07(%[s]) \n\t"
- "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
- "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
- "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t"
-
- "gsldrc1 %[t0], 0x00(%[t]) \n\t"
- "gsldlc1 %[t0], 0x07(%[t]) \n\t"
- "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
- "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t"
-
- "paddh %[dest_lo], %[dest_lo], %[ph] \n\t"
- "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t"
-
- "gsldrc1 %[s0], 0x08(%[s]) \n\t"
- "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
- "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
- "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
- "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t"
-
- "gsldrc1 %[t0], 0x08(%[t]) \n\t"
- "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
- "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
- "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t"
-
- "paddh %[dest_hi], %[dest_hi], %[ph] \n\t"
- "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[s], %[s], 0x10 \n\t"
- "daddiu %[t], %[t], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo),
- [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest)
- : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
- [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit)
- : "memory");
-}
-
-void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1, dest;
- const uint64_t shift = 0x10ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "psrlw %[src0], %[src0], %[shift] \n\t"
-
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "psrlw %[src1], %[src1], %[shift] \n\t"
-
- "packsswh %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift] "f"(shift)
- : "memory");
-}
-
-void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "punpcklhw %[dest_lo], %[src0], %[src1] \n\t"
- "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
-
- "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t"
- "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t"
-
- "pavgh %[dest], %[src0], %[src1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* s = src_ptr;
- const uint16_t* t = src_ptr + src_stride;
-
- uint64_t s0, s1, s_hi, s_lo;
- uint64_t t0, t1, t_hi, t_lo;
- uint64_t dest, dest0, dest1;
-
- const uint64_t ph = 0x0000000200000002ULL;
- const uint64_t mask = 0x0000ffff0000ffffULL;
- const uint64_t shift0 = 0x10ULL;
- const uint64_t shift1 = 0x2ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[s0], 0x00(%[s]) \n\t"
- "gsldlc1 %[s0], 0x07(%[s]) \n\t"
- "psrlw %[s1], %[s0], %[shift0] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x00(%[t]) \n\t"
- "gsldlc1 %[t0], 0x07(%[t]) \n\t"
- "psrlw %[t1], %[t0], %[shift0] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddw %[dest0], %[s0], %[s1] \n\t"
- "paddw %[dest0], %[dest0], %[t0] \n\t"
- "paddw %[dest0], %[dest0], %[t1] \n\t"
- "paddw %[dest0], %[dest0], %[ph] \n\t"
- "psrlw %[dest0], %[dest0], %[shift1] \n\t"
-
- "gsldrc1 %[s0], 0x08(%[s]) \n\t"
- "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
- "psrlw %[s1], %[s0], %[shift0] \n\t"
- "and %[s0], %[s0], %[mask] \n\t"
-
- "gsldrc1 %[t0], 0x08(%[t]) \n\t"
- "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
- "psrlw %[t1], %[t0], %[shift0] \n\t"
- "and %[t0], %[t0], %[mask] \n\t"
-
- "paddw %[dest1], %[s0], %[s1] \n\t"
- "paddw %[dest1], %[dest1], %[t0] \n\t"
- "paddw %[dest1], %[dest1], %[t1] \n\t"
- "paddw %[dest1], %[dest1], %[ph] \n\t"
- "psrlw %[dest1], %[dest1], %[shift1] \n\t"
-
- "packsswh %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[s], %[s], 0x10 \n\t"
- "daddiu %[t], %[t], 0x10 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
- [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi),
- [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
- [dest] "=&f"(dest)
- : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
- [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleRowDown4_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- const uint64_t shift = 0x10ULL;
- const uint64_t mask = 0x000000ff000000ffULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "psrlw %[src0], %[src0], %[shift] \n\t"
- "and %[src0], %[src0], %[mask] \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "psrlw %[src1], %[src1], %[shift] \n\t"
- "and %[src1], %[src1], %[mask] \n\t"
- "packsswh %[dest_lo], %[src0], %[src1] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
- "psrlw %[src0], %[src0], %[shift] \n\t"
- "and %[src0], %[src0], %[mask] \n\t"
- "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
- "psrlw %[src1], %[src1], %[shift] \n\t"
- "and %[src1], %[src1], %[mask] \n\t"
- "packsswh %[dest_hi], %[src0], %[src1] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [shift] "f"(shift), [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest_hi, dest_lo;
-
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "punpckhhw %[dest_lo], %[src0], %[src1] \n\t"
- "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t"
-
- "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
- "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
- "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
- [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [mask] "f"(mask)
- : "memory");
-}
-
-#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
-
-#define DO_SCALEROWDOWN4BOX_LOOP(reg) \
- "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
- "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
- \
- "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_PUNPCKADD() \
- \
- "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \
- "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \
- "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \
- "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \
- "paddh " #reg ", " #reg ", %[ph] \n\t" \
- "psrlh " #reg ", " #reg ", %[shift] \n\t" \
- \
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
- "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
- "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
-
-/* LibYUVScaleTest.ScaleDownBy4_Box */
-void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- const uint8_t* src0_ptr = src_ptr;
- const uint8_t* src1_ptr = src_ptr + src_stride;
- const uint8_t* src2_ptr = src_ptr + src_stride * 2;
- const uint8_t* src3_ptr = src_ptr + src_stride * 3;
-
- uint64_t src, src_hi, src_lo;
- uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
-
- const uint64_t mask0 = 0x0ULL;
- const uint64_t mask1 = 0x0001000100010001ULL;
- const uint64_t ph = 0x0008000800080008ULL;
- const uint64_t shift = 0x4ULL;
-
- __asm__ volatile(
- "1: \n\t"
-
- DO_SCALEROWDOWN4BOX_LOOP(%[dest0])
- DO_SCALEROWDOWN4BOX_LOOP(%[dest1])
- DO_SCALEROWDOWN4BOX_LOOP(%[dest2])
- DO_SCALEROWDOWN4BOX_LOOP(%[dest3])
-
- "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
- "packsswh %[dest_hi], %[dest2], %[dest3] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
- [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
- [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
- [ph] "f"(ph), [mask1] "f"(mask1)
- : "memory");
-}
-
-#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
-
-#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \
- "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
- "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
- "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
- \
- "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- \
- "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
- DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
- \
- "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \
- "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \
- "paddw %[dest], %[dest_hi], %[dest] \n\t" \
- "paddw %[dest], %[dest], %[ph] \n\t" \
- "psraw %[dest], %[dest], %[shift] \n\t" \
- "and " #reg ", %[dest], %[mask1] \n\t" \
- \
- "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
- "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
- "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
-
-/* LibYUVScaleTest.ScaleDownBy4_Box_16 */
-void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* src0_ptr = src_ptr;
- const uint16_t* src1_ptr = src_ptr + src_stride;
- const uint16_t* src2_ptr = src_ptr + src_stride * 2;
- const uint16_t* src3_ptr = src_ptr + src_stride * 3;
-
- uint64_t src, src_hi, src_lo;
- uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
-
- const uint64_t mask0 = 0x0ULL;
- const uint64_t mask1 = 0x00000000ffffffffULL;
- const uint64_t ph = 0x0000000800000008ULL;
- const uint64_t shift = 0x04ULL;
-
- __asm__ volatile(
- "1: \n\t"
-
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0])
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1])
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2])
- DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3])
- "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t"
- "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t"
-
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
- [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
- [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
- [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
- [ph] "f"(ph), [mask1] "f"(mask1)
- : "memory");
-}
-
-// Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_MMI(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- uint64_t src, dest;
-
- (void)x;
- (void)dx;
-
- __asm__ volatile(
- "1: \n\t"
- "lwc1 %[src], 0x00(%[src_ptr]) \n\t"
-
- "punpcklbh %[dest], %[src], %[src] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
- const uint16_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- uint64_t src, dest;
-
- (void)x;
- (void)dx;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
-
- "punpcklhw %[dest], %[src], %[src] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "punpckhhw %[dest], %[src], %[src] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src] "=&f"(src), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
- uint64_t src, src_hi, src_lo, dest0, dest1;
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src], %[mask] \n\t"
- "punpckhbh %[src_hi], %[src], %[mask] \n\t"
-
- "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "paddush %[dest0], %[dest0], %[src_lo] \n\t"
- "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "paddush %[dest1], %[dest1], %[src_hi] \n\t"
-
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [src] "=&f"(src)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
- [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
- uint32_t* dst_ptr,
- int src_width) {
- uint64_t src, src_hi, src_lo, dest0, dest1;
- const uint64_t mask = 0x0ULL;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "punpcklhw %[src_lo], %[src], %[mask] \n\t"
- "punpckhhw %[src_hi], %[src], %[mask] \n\t"
-
- "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "paddw %[dest0], %[dest0], %[src_lo] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
-
- "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
- "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "paddw %[dest1], %[dest1], %[src_hi] \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
- [src_lo] "=&f"(src_lo), [src] "=&f"(src)
- : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
- [mask] "f"(mask)
- : "memory");
-}
-
-void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- (void)src_stride;
-
- uint64_t src0, src1, dest;
-
- __asm__ volatile(
- "1: \n\t"
- "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
- "lwc1 %[src1], 0x00(%[src_ptr]) \n\t"
- "punpcklwd %[dest], %[src0], %[src1] \n\t"
-
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb),
- [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width)
- : "memory");
-}
-
-void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- const uint8_t* src0_ptr = src_argb;
- const uint8_t* src1_ptr = src_argb + src_stride;
-
- uint64_t src0, src1, src_hi, src_lo;
- uint64_t dest, dest_hi, dest_lo, dest0, dest1;
-
- const uint64_t mask = 0x0ULL;
- const uint64_t ph = 0x0002000200020002ULL;
- const uint64_t shift = 0x2ULL;
-
- __asm__ volatile(
- "1: \n\t"
-
- "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
- "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
-
- "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
- "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
- "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
- "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t"
- "paddh %[dest0], %[dest0], %[ph] \n\t"
- "psrlh %[dest0], %[dest0], %[shift] \n\t"
-
- "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
- "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
-
- "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
- "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
- "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
-
- "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
- "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
- "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
- "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
- "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
- "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
- "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t"
- "paddh %[dest1], %[dest1], %[ph] \n\t"
- "psrlh %[dest1], %[dest1], %[shift] \n\t"
-
- "packushb %[dest], %[dest0], %[dest1] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
- "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
- "daddi %[width], %[width], -0x02 \n\t"
- "bnez %[width], 1b \n\t"
- : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
- [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
- [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
- [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
- [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
- [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask),
- [ph] "f"(ph)
- : "memory");
-}
-
-// Scales a single row of pixels using point sampling.
-void ScaleARGBCols_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- const uint32_t* src = (const uint32_t*)(src_argb);
- uint32_t* dst = (uint32_t*)(dst_argb);
-
- const uint32_t* src_tmp;
-
- uint64_t dest, offset;
-
- const uint64_t shift0 = 16;
- const uint64_t shift1 = 2;
-
- __asm__ volatile(
- "1: \n\t"
- "srav %[offset], %[x], %[shift0] \n\t"
- "sllv %[offset], %[offset], %[shift1] \n\t"
- "dadd %[src_tmp], %[src_ptr], %[offset] \n\t"
- "lwc1 %[dest], 0x00(%[src_tmp]) \n\t"
- "swc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- "dadd %[x], %[x], %[dx] \n\t"
-
- "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t"
- "daddi %[width], %[width], -0x01 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp)
- : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width),
- [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1)
- : "memory");
-}
-
-// Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- uint64_t src, dest0, dest1;
- (void)x;
- (void)dx;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
- "punpcklwd %[dest0], %[src], %[src] \n\t"
- "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
- "punpckhwd %[dest1], %[src], %[src] \n\t"
- "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x04 \n\t"
- "bnez %[width], 1b \n\t"
- : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src)
- : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
- : "memory");
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-/* LibYUVBaseTest.TestFixedDiv */
-int FixedDiv_MIPS(int num, int div) {
- int quotient = 0;
- const int shift = 16;
-
- asm(
- "dsll %[num], %[num], %[shift] \n\t"
- "ddiv %[num], %[div] \t\n"
- "mflo %[quo] \t\n"
- : [quo] "+&r"(quotient)
- : [num] "r"(num), [div] "r"(div), [shift] "r"(shift));
-
- return quotient;
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */
-int FixedDiv1_MIPS(int num, int div) {
- int quotient = 0;
- const int shift = 16;
- const int val1 = 1;
- const int64_t val11 = 0x00010001ULL;
-
- asm(
- "dsll %[num], %[num], %[shift] \n\t"
- "dsub %[num], %[num], %[val11] \n\t"
- "dsub %[div], %[div], %[val1] \n\t"
- "ddiv %[num], %[div] \t\n"
- "mflo %[quo] \t\n"
- : [quo] "+&r"(quotient)
- : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11),
- [shift] "r"(shift));
-
- return quotient;
-}
-
-// Read 8x2 upsample with filtering and write 16x1.
-// actually reads an extra pixel, so 9x2.
-void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
- ptrdiff_t src_stride,
- uint16_t* dst,
- int dst_width) {
- const uint16_t* src2_ptr = src_ptr + src_stride;
-
- uint64_t src0, src1;
- uint64_t dest, dest04, dest15, dest26, dest37;
- uint64_t tmp0, tmp1, tmp2, tmp3;
-
- const uint64_t mask0 = 0x0003000900030009ULL;
- const uint64_t mask1 = 0x0001000300010003ULL;
- const uint64_t mask2 = 0x0009000300090003ULL;
- const uint64_t mask3 = 0x0003000100030001ULL;
- const uint64_t ph = 0x0000000800000008ULL;
- const uint64_t shift = 4;
-
- __asm__ volatile(
- "1: \n\t"
- "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t"
- "pmaddhw %[dest04], %[src0], %[mask0] \n\t"
- "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t"
- "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t"
- "pmaddhw %[dest], %[src1], %[mask1] \n\t"
- "paddw %[dest04], %[dest04], %[dest] \n\t"
- "paddw %[dest04], %[dest04], %[ph] \n\t"
- "psrlw %[dest04], %[dest04], %[shift] \n\t"
-
- "pmaddhw %[dest15], %[src0], %[mask2] \n\t"
- "pmaddhw %[dest], %[src1], %[mask3] \n\t"
- "paddw %[dest15], %[dest15], %[dest] \n\t"
- "paddw %[dest15], %[dest15], %[ph] \n\t"
- "psrlw %[dest15], %[dest15], %[shift] \n\t"
-
- "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t"
- "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t"
- "pmaddhw %[dest26], %[src0], %[mask0] \n\t"
- "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t"
- "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t"
- "pmaddhw %[dest], %[src1], %[mask1] \n\t"
- "paddw %[dest26], %[dest26], %[dest] \n\t"
- "paddw %[dest26], %[dest26], %[ph] \n\t"
- "psrlw %[dest26], %[dest26], %[shift] \n\t"
-
- "pmaddhw %[dest37], %[src0], %[mask2] \n\t"
- "pmaddhw %[dest], %[src1], %[mask3] \n\t"
- "paddw %[dest37], %[dest37], %[dest] \n\t"
- "paddw %[dest37], %[dest37], %[ph] \n\t"
- "psrlw %[dest37], %[dest37], %[shift] \n\t"
-
- /* tmp0 = ( 00 04 02 06 ) */
- "packsswh %[tmp0], %[dest04], %[dest26] \n\t"
- /* tmp1 = ( 01 05 03 07 ) */
- "packsswh %[tmp1], %[dest15], %[dest37] \n\t"
-
- /* tmp2 = ( 00 01 04 05 )*/
- "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t"
- /* tmp3 = ( 02 03 06 07 )*/
- "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t"
-
- /* ( 00 01 02 03 ) */
- "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t"
- "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
-
- /* ( 04 05 06 07 ) */
- "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t"
- "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
- "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
- "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x08 \n\t"
- "bnez %[width], 1b \n\t"
- : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04),
- [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37),
- [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
- [tmp3] "=&f"(tmp3), [dest] "=&f"(dest)
- : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst),
- [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1),
- [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph)
- : "memory");
-}
-
-void ScaleRowDown34_MMI(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst,
- int dst_width) {
- (void)src_stride;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- uint64_t src[2];
- uint64_t tmp[2];
- __asm__ volatile (
- "1: \n\t"
- "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
- "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
- "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
- "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
- "and %[tmp1], %[src0], %[mask1] \n\t"
- "psrlw %[tmp0], %[src0], %[rmov] \n\t"
- "psllw %[tmp0], %[tmp0], %[lmov1] \n\t"
- "or %[src0], %[tmp0], %[tmp1] \n\t"
- "punpckhwd %[tmp0], %[src0], %[src0] \n\t"
- "psllw %[tmp1], %[tmp0], %[rmov] \n\t"
- "or %[src0], %[src0], %[tmp1] \n\t"
- "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t"
- "pextrh %[tmp0], %[tmp0], %[zero] \n\t"
- "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t"
- "pextrh %[tmp0], %[src1], %[zero] \n\t"
- "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t"
-
- "punpckhwd %[tmp0], %[src1], %[src1] \n\t"
- "pextrh %[tmp1], %[tmp0], %[zero] \n\t"
- "psrlw %[src1], %[src1], %[rmov] \n\t"
- "psllw %[tmp1], %[tmp1], %[rmov8] \n\t"
- "or %[src1], %[src1], %[tmp1] \n\t"
- "and %[tmp0], %[tmp0], %[mask2] \n\t"
- "or %[src1], %[src1], %[tmp0] \n\t"
-
- "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t"
- "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t"
- "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t"
- "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t"
-
- "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
- "daddi %[width], %[width], -0x0c \n\t"
- "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
- "bnez %[width], 1b \n\t"
-
- : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]),
- [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1])
- : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst),
- [lmov]"f"(0xc), [rmov]"f"(0x18),
- [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8),
- [zero]"f"(0x0), [mask2]"f"(0xff000000),
- [width]"r"(dst_width), [lmov1]"f"(0x10)
- : "memory"
- );
-}
-// clang-format on
-
-#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_uv.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_uv.cc
index 65f986e93..1556071d0 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/scale_uv.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/scale_uv.cc
@@ -397,6 +397,11 @@ static void ScaleUVBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
+#endif
#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
@@ -510,6 +515,11 @@ static void ScaleUVBilinearUp(int src_width,
InterpolateRow = InterpolateRow_LSX;
}
}
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ InterpolateRow = InterpolateRow_RVV;
+ }
#endif
if (src_width >= 32768) {
ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_win.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_win.cc
deleted file mode 100644
index ea1f95c6c..000000000
--- a/libfenrir/src/main/jni/animation/libyuv/source/scale_win.cc
+++ /dev/null
@@ -1,1392 +0,0 @@
-/*
- * Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for 32 bit Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- !defined(__clang__) && defined(_M_IX86)
-
-// Offsets for source bytes 0 to 9
-static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Offsets for source bytes 0 to 10
-static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
- 8, 9, 9, 10, 10, 11, 12, 13};
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
- 10, 11, 12, 13, 13, 14, 14, 15};
-
-// Coefficients for source bytes 0 to 10
-static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
-
-// Coefficients for source bytes 10 to 21
-static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
-
-// Coefficients for source bytes 21 to 31
-static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
-
-// Coefficients for source bytes 21 to 31
-static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
-
-static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
- 6, 8, 11, 14, 128, 128, 128, 128};
-
-// Arrange words 0,3,6 into 0,1,2
-static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128};
-
-// Arrange words 0,3,6 into 3,4,5
-static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
- 6, 7, 12, 13, 128, 128, 128, 128};
-
-// Scaling values for boxes of 3x3 and 2x3
-static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
- 65536 / 9, 65536 / 6, 0, 0};
-
-// Arrange first value for pixels 0,1,2,3,4,5
-static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
- 11, 128, 14, 128, 128, 128, 128, 128};
-
-// Arrange second value for pixels 0,1,2,3,4,5
-static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
- 12, 128, 15, 128, 128, 128, 128, 128};
-
-// Arrange third value for pixels 0,1,2,3,4,5
-static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
- 13, 128, 128, 128, 128, 128, 128, 128};
-
-// Scaling values for boxes of 3x2 and 2x2
-static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
- 65536 / 3, 65536 / 2, 0, 0};
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // isolate odd pixels.
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x1 rectangle to 16x1.
-__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- pcmpeqb xmm4, xmm4 // constant 0x0101
- psrlw xmm4, 15
- packuswb xmm4, xmm4
- pxor xmm5, xmm5 // constant 0
-
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pmaddubsw xmm0, xmm4 // horizontal add
- pmaddubsw xmm1, xmm4
- pavgw xmm0, xmm5 // (x + 1) / 2
- pavgw xmm1, xmm5
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 16x1.
-__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
-
- pcmpeqb xmm4, xmm4 // constant 0x0101
- psrlw xmm4, 15
- packuswb xmm4, xmm4
- pxor xmm5, xmm5 // constant 0
-
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pmaddubsw xmm0, xmm4 // horizontal add
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // vertical add
- paddw xmm1, xmm3
- psrlw xmm0, 1
- psrlw xmm1, 1
- pavgw xmm0, xmm5 // (x + 1) / 2
- pavgw xmm1, xmm5
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg wloop
-
- pop esi
- ret
- }
-}
-
-#ifdef HAS_SCALEROWDOWN2_AVX2
-// Reads 64 pixels, throws half away and writes 32 pixels.
-__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- wloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
- vpsrlw ymm1, ymm1, 8
- vpackuswb ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg wloop
-
- vzeroupper
- ret
- }
-}
-
-// Blends 64x1 rectangle to 32x1.
-__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
- vpsrlw ymm4, ymm4, 15
- vpackuswb ymm4, ymm4, ymm4
- vpxor ymm5, ymm5, ymm5 // constant 0
-
- wloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
- vpmaddubsw ymm1, ymm1, ymm4
- vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
- vpavgw ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg wloop
-
- vzeroupper
- ret
- }
-}
-
-// For rounding, average = (sum + 2) / 4
-// becomes average((sum >> 1), 0)
-// Blends 64x2 rectangle to 32x1.
-__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
-
- vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
- vpsrlw ymm4, ymm4, 15
- vpackuswb ymm4, ymm4, ymm4
- vpxor ymm5, ymm5, ymm5 // constant 0
-
- wloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + esi]
- vmovdqu ymm3, [eax + esi + 32]
- lea eax, [eax + 64]
- vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // vertical add
- vpaddw ymm1, ymm1, ymm3
- vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
- vpsrlw ymm1, ymm1, 1
- vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
- vpavgw ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
- vmovdqu [edx], ymm0
- lea edx, [edx + 32]
- sub ecx, 32
- jg wloop
-
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_SCALEROWDOWN2_AVX2
-
-// Point samples 32 pixels to 8 pixels.
-__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
- psrld xmm5, 24
- pslld xmm5, 16
-
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm0, 8
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 8
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x4 rectangle to 8x1.
-__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_ptr
- mov esi, [esp + 8 + 8] // src_stride
- mov edx, [esp + 8 + 12] // dst_ptr
- mov ecx, [esp + 8 + 16] // dst_width
- lea edi, [esi + esi * 2] // src_stride * 3
- pcmpeqb xmm4, xmm4 // constant 0x0101
- psrlw xmm4, 15
- movdqa xmm5, xmm4
- packuswb xmm4, xmm4
- psllw xmm5, 3 // constant 0x0008
-
- wloop:
- movdqu xmm0, [eax] // average rows
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- pmaddubsw xmm0, xmm4 // horizontal add
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // vertical add rows 0, 1
- paddw xmm1, xmm3
- movdqu xmm2, [eax + esi * 2]
- movdqu xmm3, [eax + esi * 2 + 16]
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // add row 2
- paddw xmm1, xmm3
- movdqu xmm2, [eax + edi]
- movdqu xmm3, [eax + edi + 16]
- lea eax, [eax + 32]
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // add row 3
- paddw xmm1, xmm3
- phaddw xmm0, xmm1
- paddw xmm0, xmm5 // + 8 for round
- psrlw xmm0, 4 // /16 for average of 4 * 4
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 8
- jg wloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-#ifdef HAS_SCALEROWDOWN4_AVX2
-// Point samples 64 pixels to 16 pixels.
-__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
- vpsrld ymm5, ymm5, 24
- vpslld ymm5, ymm5, 16
-
- wloop:
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- lea eax, [eax + 64]
- vpand ymm0, ymm0, ymm5
- vpand ymm1, ymm1, ymm5
- vpackuswb ymm0, ymm0, ymm1
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
- vpsrlw ymm0, ymm0, 8
- vpackuswb ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
- vmovdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg wloop
-
- vzeroupper
- ret
- }
-}
-
-// Blends 64x4 rectangle to 16x1.
-__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_ptr
- mov esi, [esp + 8 + 8] // src_stride
- mov edx, [esp + 8 + 12] // dst_ptr
- mov ecx, [esp + 8 + 16] // dst_width
- lea edi, [esi + esi * 2] // src_stride * 3
- vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
- vpsrlw ymm4, ymm4, 15
- vpsllw ymm5, ymm4, 3 // constant 0x0008
- vpackuswb ymm4, ymm4, ymm4
-
- wloop:
- vmovdqu ymm0, [eax] // average rows
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + esi]
- vmovdqu ymm3, [eax + esi + 32]
- vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
- vpaddw ymm1, ymm1, ymm3
- vmovdqu ymm2, [eax + esi * 2]
- vmovdqu ymm3, [eax + esi * 2 + 32]
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // add row 2
- vpaddw ymm1, ymm1, ymm3
- vmovdqu ymm2, [eax + edi]
- vmovdqu ymm3, [eax + edi + 32]
- lea eax, [eax + 64]
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // add row 3
- vpaddw ymm1, ymm1, ymm3
- vphaddw ymm0, ymm0, ymm1 // mutates
- vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
- vpaddw ymm0, ymm0, ymm5 // + 8 for round
- vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4
- vpackuswb ymm0, ymm0, ymm0
- vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
- vmovdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- jg wloop
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_SCALEROWDOWN4_AVX2
-
-// Point samples 32 pixels to 24 pixels.
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- movdqa xmm3, xmmword ptr kShuf0
- movdqa xmm4, xmmword ptr kShuf1
- movdqa xmm5, xmmword ptr kShuf2
-
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm1
- palignr xmm1, xmm0, 8
- pshufb xmm0, xmm3
- pshufb xmm1, xmm4
- pshufb xmm2, xmm5
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + 8], xmm1
- movq qword ptr [edx + 16], xmm2
- lea edx, [edx + 24]
- sub ecx, 24
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 24x1
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Register usage:
-// xmm0 src_row 0
-// xmm1 src_row 1
-// xmm2 shuf 0
-// xmm3 shuf 1
-// xmm4 shuf 2
-// xmm5 madd 0
-// xmm6 madd 1
-// xmm7 kRound34
-
-// Note that movdqa+palign may be better than movdqu.
-__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, xmmword ptr kShuf01
- movdqa xmm3, xmmword ptr kShuf11
- movdqa xmm4, xmmword ptr kShuf21
- movdqa xmm5, xmmword ptr kMadd01
- movdqa xmm6, xmmword ptr kMadd11
- movdqa xmm7, xmmword ptr kRound34
-
- wloop:
- movdqu xmm0, [eax] // pixels 0..7
- movdqu xmm1, [eax + esi]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax + 8] // pixels 8..15
- movdqu xmm1, [eax + esi + 8]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx + 8], xmm0
- movdqu xmm0, [eax + 16] // pixels 16..23
- movdqu xmm1, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm4
- movdqa xmm1, xmmword ptr kMadd21
- pmaddubsw xmm0, xmm1
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx + 16], xmm0
- lea edx, [edx + 24]
- sub ecx, 24
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Note that movdqa+palign may be better than movdqu.
-__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, xmmword ptr kShuf01
- movdqa xmm3, xmmword ptr kShuf11
- movdqa xmm4, xmmword ptr kShuf21
- movdqa xmm5, xmmword ptr kMadd01
- movdqa xmm6, xmmword ptr kMadd11
- movdqa xmm7, xmmword ptr kRound34
-
- wloop:
- movdqu xmm0, [eax] // pixels 0..7
- movdqu xmm1, [eax + esi]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax + 8] // pixels 8..15
- movdqu xmm1, [eax + esi + 8]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx + 8], xmm0
- movdqu xmm0, [eax + 16] // pixels 16..23
- movdqu xmm1, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm4
- movdqa xmm1, xmmword ptr kMadd21
- pmaddubsw xmm0, xmm1
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx + 16], xmm0
- lea edx, [edx+24]
- sub ecx, 24
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// 3/8 point sampler
-
-// Scale 32 pixels to 12
-__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- movdqa xmm4, xmmword ptr kShuf38a
- movdqa xmm5, xmmword ptr kShuf38b
-
- xloop:
- movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
- movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
- lea eax, [eax + 32]
- pshufb xmm0, xmm4
- pshufb xmm1, xmm5
- paddusb xmm0, xmm1
-
- movq qword ptr [edx], xmm0 // write 12 pixels
- movhlps xmm1, xmm0
- movd [edx + 8], xmm1
- lea edx, [edx + 12]
- sub ecx, 12
- jg xloop
-
- ret
- }
-}
-
-// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, xmmword ptr kShufAc
- movdqa xmm3, xmmword ptr kShufAc3
- movdqa xmm4, xmmword ptr kScaleAc33
- pxor xmm5, xmm5
-
- xloop:
- movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
- movdqu xmm6, [eax + esi]
- movhlps xmm1, xmm0
- movhlps xmm7, xmm6
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpcklbw xmm6, xmm5
- punpcklbw xmm7, xmm5
- paddusw xmm0, xmm6
- paddusw xmm1, xmm7
- movdqu xmm6, [eax + esi * 2]
- lea eax, [eax + 16]
- movhlps xmm7, xmm6
- punpcklbw xmm6, xmm5
- punpcklbw xmm7, xmm5
- paddusw xmm0, xmm6
- paddusw xmm1, xmm7
-
- movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
- psrldq xmm0, 2
- paddusw xmm6, xmm0
- psrldq xmm0, 2
- paddusw xmm6, xmm0
- pshufb xmm6, xmm2
-
- movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
- psrldq xmm1, 2
- paddusw xmm7, xmm1
- psrldq xmm1, 2
- paddusw xmm7, xmm1
- pshufb xmm7, xmm3
- paddusw xmm6, xmm7
-
- pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
- packuswb xmm6, xmm6
-
- movd [edx], xmm6 // write 6 pixels
- psrlq xmm6, 16
- movd [edx + 2], xmm6
- lea edx, [edx + 6]
- sub ecx, 6
- jg xloop
-
- pop esi
- ret
- }
-}
-
-// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
- ptrdiff_t src_stride,
- uint8_t* dst_ptr,
- int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, xmmword ptr kShufAb0
- movdqa xmm3, xmmword ptr kShufAb1
- movdqa xmm4, xmmword ptr kShufAb2
- movdqa xmm5, xmmword ptr kScaleAb2
-
- xloop:
- movdqu xmm0, [eax] // average 2 rows into xmm0
- movdqu xmm1, [eax + esi]
- lea eax, [eax + 16]
- pavgb xmm0, xmm1
-
- movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
- pshufb xmm1, xmm2
- movdqa xmm6, xmm0
- pshufb xmm6, xmm3
- paddusw xmm1, xmm6
- pshufb xmm0, xmm4
- paddusw xmm1, xmm0
-
- pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
- packuswb xmm1, xmm1
-
- movd [edx], xmm1 // write 6 pixels
- psrlq xmm1, 16
- movd [edx + 2], xmm1
- lea edx, [edx + 6]
- sub ecx, 6
- jg xloop
-
- pop esi
- ret
- }
-}
-
-// Reads 16 bytes and accumulates to 16 shorts at a time.
-__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- mov edx, [esp + 8] // dst_ptr
- mov ecx, [esp + 12] // src_width
- pxor xmm5, xmm5
-
- // sum rows
- xloop:
- movdqu xmm3, [eax] // read 16 bytes
- lea eax, [eax + 16]
- movdqu xmm0, [edx] // read 16 words from destination
- movdqu xmm1, [edx + 16]
- movdqa xmm2, xmm3
- punpcklbw xmm2, xmm5
- punpckhbw xmm3, xmm5
- paddusw xmm0, xmm2 // sum 16 words
- paddusw xmm1, xmm3
- movdqu [edx], xmm0 // write 16 words to destination
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 16
- jg xloop
- ret
- }
-}
-
-#ifdef HAS_SCALEADDROW_AVX2
-// Reads 32 bytes and accumulates to 32 shorts at a time.
-__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- mov edx, [esp + 8] // dst_ptr
- mov ecx, [esp + 12] // src_width
- vpxor ymm5, ymm5, ymm5
-
- // sum rows
- xloop:
- vmovdqu ymm3, [eax] // read 32 bytes
- lea eax, [eax + 32]
- vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
- vpunpcklbw ymm2, ymm3, ymm5
- vpunpckhbw ymm3, ymm3, ymm5
- vpaddusw ymm0, ymm2, [edx] // sum 16 words
- vpaddusw ymm1, ymm3, [edx + 32]
- vmovdqu [edx], ymm0 // write 32 words to destination
- vmovdqu [edx + 32], ymm1
- lea edx, [edx + 64]
- sub ecx, 32
- jg xloop
-
- vzeroupper
- ret
- }
-}
-#endif // HAS_SCALEADDROW_AVX2
-
-// Constant for making pixels signed to avoid pmaddubsw
-// saturation.
-static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
-
-// Constant for making pixels unsigned and adding .5 for rounding.
-static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
- 0x4040, 0x4040, 0x4040, 0x4040};
-
-// Bilinear column filtering. SSSE3 version.
-__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- __asm {
- push ebx
- push esi
- push edi
- mov edi, [esp + 12 + 4] // dst_ptr
- mov esi, [esp + 12 + 8] // src_ptr
- mov ecx, [esp + 12 + 12] // dst_width
- movd xmm2, [esp + 12 + 16] // x
- movd xmm3, [esp + 12 + 20] // dx
- mov eax, 0x04040000 // shuffle to line up fractions with pixel.
- movd xmm5, eax
- pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
- psrlw xmm6, 9
- pcmpeqb xmm7, xmm7 // generate 0x0001
- psrlw xmm7, 15
- pextrw eax, xmm2, 1 // get x0 integer. preroll
- sub ecx, 2
- jl xloop29
-
- movdqa xmm0, xmm2 // x1 = x0 + dx
- paddd xmm0, xmm3
- punpckldq xmm2, xmm0 // x0 x1
- punpckldq xmm3, xmm3 // dx dx
- paddd xmm3, xmm3 // dx * 2, dx * 2
- pextrw edx, xmm2, 3 // get x1 integer. preroll
-
- // 2 Pixel loop.
- xloop2:
- movdqa xmm1, xmm2 // x0, x1 fractions.
- paddd xmm2, xmm3 // x += dx
- movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
- movd xmm0, ebx
- psrlw xmm1, 9 // 7 bit fractions.
- movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
- movd xmm4, ebx
- pshufb xmm1, xmm5 // 0011
- punpcklwd xmm0, xmm4
- psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
- pxor xmm1, xmm6 // 0..7f and 7f..0
- paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
- pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
- psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
- packuswb xmm1, xmm1 // 8 bits, 2 pixels.
- movd ebx, xmm1
- mov [edi], bx
- lea edi, [edi + 2]
- sub ecx, 2 // 2 pixels
- jge xloop2
-
- xloop29:
- add ecx, 2 - 1
- jl xloop99
-
- // 1 pixel remainder
- movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
- movd xmm0, ebx
- psrlw xmm2, 9 // 7 bit fractions.
- pshufb xmm2, xmm5 // 0011
- psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
- pxor xmm2, xmm6 // 0..7f and 7f..0
- paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
- pmaddubsw xmm2, xmm0 // 16 bit
- paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
- psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
- packuswb xmm2, xmm2 // 8 bits
- movd ebx, xmm2
- mov [edi], bl
-
- xloop99:
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// Reads 16 pixels, duplicates them and writes 32 pixels.
-__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
- const uint8_t* src_ptr,
- int dst_width,
- int x,
- int dx) {
- __asm {
- mov edx, [esp + 4] // dst_ptr
- mov eax, [esp + 8] // src_ptr
- mov ecx, [esp + 12] // dst_width
-
- wloop:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm0
- punpckhbw xmm1, xmm1
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 32
- jg wloop
-
- ret
- }
-}
-
-// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- // src_stride ignored
- mov edx, [esp + 12] // dst_argb
- mov ecx, [esp + 16] // dst_width
-
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- shufps xmm0, xmm1, 0xdd
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg wloop
-
- ret
- }
-}
-
-// Blends 8x1 rectangle to 4x1.
-__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- // src_stride ignored
- mov edx, [esp + 12] // dst_argb
- mov ecx, [esp + 16] // dst_width
-
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
- pavgb xmm0, xmm2
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg wloop
-
- ret
- }
-}
-
-// Blends 8x2 rectangle to 4x1.
-__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // dst_width
-
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
- pavgb xmm0, xmm2
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Reads 4 pixels at a time.
-__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- __asm {
- push ebx
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- // src_stride ignored
- mov ebx, [esp + 8 + 12] // src_stepx
- mov edx, [esp + 8 + 16] // dst_argb
- mov ecx, [esp + 8 + 20] // dst_width
- lea ebx, [ebx * 4]
- lea edi, [ebx + ebx * 2]
-
- wloop:
- movd xmm0, [eax]
- movd xmm1, [eax + ebx]
- punpckldq xmm0, xmm1
- movd xmm2, [eax + ebx * 2]
- movd xmm3, [eax + edi]
- lea eax, [eax + ebx * 4]
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg wloop
-
- pop edi
- pop ebx
- ret
- }
-}
-
-// Blends four 2x2 to 4x1.
-__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8_t* dst_argb,
- int dst_width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov esi, [esp + 12 + 8] // src_stride
- mov ebx, [esp + 12 + 12] // src_stepx
- mov edx, [esp + 12 + 16] // dst_argb
- mov ecx, [esp + 12 + 20] // dst_width
- lea esi, [eax + esi] // row1 pointer
- lea ebx, [ebx * 4]
- lea edi, [ebx + ebx * 2]
-
- wloop:
- movq xmm0, qword ptr [eax] // row0 4 pairs
- movhps xmm0, qword ptr [eax + ebx]
- movq xmm1, qword ptr [eax + ebx * 2]
- movhps xmm1, qword ptr [eax + edi]
- lea eax, [eax + ebx * 4]
- movq xmm2, qword ptr [esi] // row1 4 pairs
- movhps xmm2, qword ptr [esi + ebx]
- movq xmm3, qword ptr [esi + ebx * 2]
- movhps xmm3, qword ptr [esi + edi]
- lea esi, [esi + ebx * 4]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
- pavgb xmm0, xmm2
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg wloop
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// Column scaling unfiltered. SSE2 version.
-__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- __asm {
- push edi
- push esi
- mov edi, [esp + 8 + 4] // dst_argb
- mov esi, [esp + 8 + 8] // src_argb
- mov ecx, [esp + 8 + 12] // dst_width
- movd xmm2, [esp + 8 + 16] // x
- movd xmm3, [esp + 8 + 20] // dx
-
- pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
- pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
- paddd xmm2, xmm0
- paddd xmm3, xmm3 // 0, 0, 0, dx * 2
- pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
- paddd xmm2, xmm0 // x3 x2 x1 x0
- paddd xmm3, xmm3 // 0, 0, 0, dx * 4
- pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
-
- pextrw eax, xmm2, 1 // get x0 integer.
- pextrw edx, xmm2, 3 // get x1 integer.
-
- cmp ecx, 0
- jle xloop99
- sub ecx, 4
- jl xloop49
-
- // 4 Pixel loop.
- xloop4:
- movd xmm0, [esi + eax * 4] // 1 source x0 pixels
- movd xmm1, [esi + edx * 4] // 1 source x1 pixels
- pextrw eax, xmm2, 5 // get x2 integer.
- pextrw edx, xmm2, 7 // get x3 integer.
- paddd xmm2, xmm3 // x += dx
- punpckldq xmm0, xmm1 // x0 x1
-
- movd xmm1, [esi + eax * 4] // 1 source x2 pixels
- movd xmm4, [esi + edx * 4] // 1 source x3 pixels
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- punpckldq xmm1, xmm4 // x2 x3
- punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
- movdqu [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 4 // 4 pixels
- jge xloop4
-
- xloop49:
- test ecx, 2
- je xloop29
-
- // 2 Pixels.
- movd xmm0, [esi + eax * 4] // 1 source x0 pixels
- movd xmm1, [esi + edx * 4] // 1 source x1 pixels
- pextrw eax, xmm2, 5 // get x2 integer.
- punpckldq xmm0, xmm1 // x0 x1
-
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
-
- xloop29:
- test ecx, 1
- je xloop99
-
- // 1 Pixels.
- movd xmm0, [esi + eax * 4] // 1 source x2 pixels
- movd dword ptr [edi], xmm0
- xloop99:
-
- pop esi
- pop edi
- ret
- }
-}
-
-// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
-// TODO(fbarchard): Port to Neon
-
-// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static const uvec8 kShuffleColARGB = {
- 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
- 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
-};
-
-// Shuffle table for duplicating 2 fractions into 8 bytes each
-static const uvec8 kShuffleFractions = {
- 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
-};
-
-__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_argb
- mov esi, [esp + 8 + 8] // src_argb
- mov ecx, [esp + 8 + 12] // dst_width
- movd xmm2, [esp + 8 + 16] // x
- movd xmm3, [esp + 8 + 20] // dx
- movdqa xmm4, xmmword ptr kShuffleColARGB
- movdqa xmm5, xmmword ptr kShuffleFractions
- pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
- psrlw xmm6, 9
- pextrw eax, xmm2, 1 // get x0 integer. preroll
- sub ecx, 2
- jl xloop29
-
- movdqa xmm0, xmm2 // x1 = x0 + dx
- paddd xmm0, xmm3
- punpckldq xmm2, xmm0 // x0 x1
- punpckldq xmm3, xmm3 // dx dx
- paddd xmm3, xmm3 // dx * 2, dx * 2
- pextrw edx, xmm2, 3 // get x1 integer. preroll
-
- // 2 Pixel loop.
- xloop2:
- movdqa xmm1, xmm2 // x0, x1 fractions.
- paddd xmm2, xmm3 // x += dx
- movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
- psrlw xmm1, 9 // 7 bit fractions.
- movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
- pshufb xmm1, xmm5 // 0000000011111111
- pshufb xmm0, xmm4 // arrange pixels into pairs
- pxor xmm1, xmm6 // 0..7f and 7f..0
- pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
- packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
- sub ecx, 2 // 2 pixels
- jge xloop2
-
- xloop29:
-
- add ecx, 2 - 1
- jl xloop99
-
- // 1 pixel remainder
- psrlw xmm2, 9 // 7 bit fractions.
- movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
- pshufb xmm2, xmm5 // 00000000
- pshufb xmm0, xmm4 // arrange pixels into pairs
- pxor xmm2, xmm6 // 0..7f and 7f..0
- pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
- movd [edi], xmm0
-
- xloop99:
-
- pop edi
- pop esi
- ret
- }
-}
-
-// Reads 4 pixels, duplicates them and writes 8 pixels.
-__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
- const uint8_t* src_argb,
- int dst_width,
- int x,
- int dx) {
- __asm {
- mov edx, [esp + 4] // dst_argb
- mov eax, [esp + 8] // src_argb
- mov ecx, [esp + 12] // dst_width
-
- wloop:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpckldq xmm0, xmm0
- punpckhdq xmm1, xmm1
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg wloop
-
- ret
- }
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) int FixedDiv_X86(int num, int div) {
- __asm {
- mov eax, [esp + 4] // num
- cdq // extend num to 64 bits
- shld edx, eax, 16 // 32.16
- shl eax, 16
- idiv dword ptr [esp + 8]
- ret
- }
-}
-
-// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) int FixedDiv1_X86(int num, int div) {
- __asm {
- mov eax, [esp + 4] // num
- mov ecx, [esp + 8] // denom
- cdq // extend num to 64 bits
- shld edx, eax, 16 // 32.16
- shl eax, 16
- sub eax, 0x00010001
- sbb edx, 0
- sub ecx, 1
- idiv ecx
- ret
- }
-}
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/libfenrir/src/main/jni/audio/taglib/mpeg/id3v2/id3v2frame.cpp b/libfenrir/src/main/jni/audio/taglib/mpeg/id3v2/id3v2frame.cpp
index 1cf07ebf1..c3a45a2a5 100644
--- a/libfenrir/src/main/jni/audio/taglib/mpeg/id3v2/id3v2frame.cpp
+++ b/libfenrir/src/main/jni/audio/taglib/mpeg/id3v2/id3v2frame.cpp
@@ -373,6 +373,7 @@ namespace
std::make_pair("TSO2", "ALBUMARTISTSORT"), // non-standard, used by iTunes
std::make_pair("TSRC", "ISRC"),
std::make_pair("TSSE", "ENCODING"),
+ std::make_pair("TSST", "DISCSUBTITLE"),
// URL frames
std::make_pair("WCOP", "COPYRIGHTURL"),
std::make_pair("WOAF", "FILEWEBPAGE"),
diff --git a/libfenrir/src/main/jni/audio/taglib/taglib_config.h b/libfenrir/src/main/jni/audio/taglib/taglib_config.h
index fa5c4f34a..98fb804ce 100644
--- a/libfenrir/src/main/jni/audio/taglib/taglib_config.h
+++ b/libfenrir/src/main/jni/audio/taglib/taglib_config.h
@@ -7,4 +7,6 @@
/* Defined if your compiler supports some safer version of vsprintf */
#define HAVE_VSNPRINTF 1
+
+#define HAVE_ZLIB 1
#endif
diff --git a/libfenrir/src/main/jni/thorvg/inc/thorvg.h b/libfenrir/src/main/jni/thorvg/inc/thorvg.h
index 97990a8e4..a8f1f8296 100644
--- a/libfenrir/src/main/jni/thorvg/inc/thorvg.h
+++ b/libfenrir/src/main/jni/thorvg/inc/thorvg.h
@@ -161,19 +161,34 @@ enum class FillRule
EvenOdd ///< A line from the point to a location outside the shape is drawn and its intersections with the path segments of the shape are counted. If the number of intersections is an odd number, the point is inside the shape.
};
+
/**
* @brief Enumeration indicating the method used in the composition of two objects - the target and the source.
+ *
+ * In the case of Mask composition, you need to perform bit operations on two options - Mask Alpha and Mask Operation.
+ * Mask Alpha specifies the origin of the alpha channel, while Mask Operation specifies the masking operation.
+ * @code paint->composite(tvg::CompositeMethod::AlphaMask + tvg::CompositeMethod::AddMaskOp); @endcode
+ *
+ * @note If you don't specify the mask alpha, @c AlphaMask will be used.
+ * @note If you don't specify the mask method, @c AddMaskOp will be used.
+ * @warning Composition does not support multiple choices for both Mask Alpha and Mask Operation.
+ * @see Paint::composite()
*/
enum class CompositeMethod
{
- None = 0, ///< No composition is applied.
- ClipPath, ///< The intersection of the source and the target is determined and only the resulting pixels from the source are rendered.
- AlphaMask, ///< The pixels of the source and the target are alpha blended. As a result, only the part of the source, which alpha intersects with the target is visible.
- InvAlphaMask, ///< The pixels of the source and the complement to the target's pixels are alpha blended. As a result, only the part of the source which alpha is not covered by the target is visible.
- LumaMask, ///< The source pixels are converted to the grayscale (luma value) and alpha blended with the target. As a result, only the part of the source, which intersects with the target is visible. @since 0.9
- InvLumaMask ///< The source pixels are converted to the grayscale (luma value) and the complement to the target's pixels are alpha blended. As a result, only the part of the source which grayscale is not covered by the target is visible. @BETA_API
+ None = 0, ///< No composition is applied.
+ ClipPath, ///< The intersection of the source and the target is determined and only the resulting pixels from the source are rendered.
+ AlphaMask, ///< Mask Alpha: Use the compositing target's pixels as an alpha value.
+ InvAlphaMask, ///< Mask Alpha: Use the complement to the compositing target's pixels as an alpha.
+ LumaMask, ///< Mask Alpha: Use the grayscale (0.2125R + 0.7154G + 0.0721*B) of the compositing target's pixels. @since 0.9
+ InvLumaMask, ///< Mask Alpha: Use the grayscale (0.2125R + 0.7154G + 0.0721*B) of the complement to the compositing target's pixels. @BETA_API
+ AddMask, ///< Mask Operation: Combines the source and target pixels using Mask Alpha. @BETA_API
+ SubtractMask, ///< Mask Operation: Subtracts the target color from the source color while considering their respective Mask Alpha. @BETA_API
+ IntersectMask, ///< Mask Operation: Computes the result by taking the minimum value between the Mask Alpha and the target alpha and multiplies it with the source color. @BETA_API
+ DifferenceMask ///< Mask Operation: Calculates the absolute difference between the source color and the target color multiplied by the complement of the Mask Alpha. @BETA_API
};
+
/**
* @brief Enumeration specifying the engine type used for the graphics backend. For multiple backends bitwise operation is allowed.
*/
@@ -1352,8 +1367,8 @@ class TVG_API SwCanvas final : public Canvas
*/
enum Colorspace
{
- ABGR8888 = 0, ///< The channels are joined in the order: alpha, blue, green, red. Colors are alpha-premultiplied.
- ARGB8888, ///< The channels are joined in the order: alpha, red, green, blue. Colors are alpha-premultiplied.
+ ABGR8888 = 0, ///< The channels are joined in the order: alpha, blue, green, red. Colors are alpha-premultiplied. (a << 24 | b << 16 | g << 8 | r)
+ ARGB8888, ///< The channels are joined in the order: alpha, red, green, blue. Colors are alpha-premultiplied. (a << 24 | r << 16 | g << 8 | b)
ABGR8888S, ///< @BETA_API The channels are joined in the order: alpha, blue, green, red. Colors are un-alpha-premultiplied.
ARGB8888S, ///< @BETA_API The channels are joined in the order: alpha, red, green, blue. Colors are un-alpha-premultiplied.
};
@@ -1636,6 +1651,7 @@ std::unique_ptr cast(Paint* paint)
return std::unique_ptr(static_cast(paint));
}
+
/**
* @brief The cast() function is a utility function used to cast a 'Fill' to type 'T'.
*
@@ -1648,6 +1664,17 @@ std::unique_ptr cast(Fill* fill)
}
+/**
+ * @brief The operator() function is the OR function used to combine Mask Alpha & Mask Operation
+ *
+ * @BETA_API
+ */
+constexpr CompositeMethod operator+(CompositeMethod a, CompositeMethod b)
+{
+ return CompositeMethod(int(a) | int(b));
+}
+
+
/** @}*/
} //namespace
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwCommon.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwCommon.h
index 5977e9321..0181c8c05 100644
--- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwCommon.h
+++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwCommon.h
@@ -248,7 +248,8 @@ struct SwBlender
SwAlpha alpha(CompositeMethod method)
{
- return alphas[(int)(method) - 2]; //0: None, 1: ClipPath
+ auto idx = (int)(method) - 2; //0: None, 1: ClipPath
+ return alphas[idx > 3 ? 0 : idx]; //CompositeMethod has only four Matting methods.
}
};
@@ -287,9 +288,14 @@ static inline uint32_t ALPHA_BLEND(uint32_t c, uint32_t a)
((((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff));
}
-static inline uint32_t INTERPOLATE(uint32_t a, uint32_t c0, uint32_t c1)
+static inline uint32_t INTERPOLATE(uint32_t s, uint32_t d, uint8_t a)
{
- return (((((((c0 >> 8) & 0xff00ff) - ((c1 >> 8) & 0xff00ff)) * a) + (c1 & 0xff00ff00)) & 0xff00ff00) + ((((((c0 & 0xff00ff) - (c1 & 0xff00ff)) * a) >> 8) + (c1 & 0xff00ff)) & 0xff00ff));
+ return (((((((s >> 8) & 0xff00ff) - ((d >> 8) & 0xff00ff)) * a) + (d & 0xff00ff00)) & 0xff00ff00) + ((((((s & 0xff00ff) - (d & 0xff00ff)) * a) >> 8) + (d & 0xff00ff)) & 0xff00ff));
+}
+
+static inline uint8_t INTERPOLATE8(uint8_t s, uint8_t d, uint8_t a)
+{
+ return ((s * a + 0xff) >> 8) + ((d * ~a + 0xff) >> 8);
}
static inline SwCoord HALF_STROKE(float width)
@@ -297,6 +303,61 @@ static inline SwCoord HALF_STROKE(float width)
return TO_SWCOORD(width * 0.5f);
}
+static inline uint8_t MULTIPLY(uint8_t c, uint8_t a)
+{
+ return ((c * a + 0xff) >> 8);
+}
+
+static inline uint8_t ALPHA(uint32_t c)
+{
+ return (c >> 24);
+}
+
+static inline uint8_t IALPHA(uint32_t c)
+{
+ return (~c >> 24);
+}
+
+
+typedef uint32_t(*SwBlendOp)(uint32_t s, uint32_t d, uint8_t a); //src, dst, alpha
+
+static inline uint32_t opAlphaBlend(uint32_t s, uint32_t d, uint8_t a)
+{
+ auto t = ALPHA_BLEND(s, a);
+ return t + ALPHA_BLEND(d, IALPHA(t));
+}
+
+static inline uint32_t opBlend(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)
+{
+ return s + ALPHA_BLEND(d, IALPHA(s));
+}
+
+static inline uint32_t opAddMask(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)
+{
+ return opBlend(s, d, a);
+}
+
+static inline uint32_t opSubMask(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)
+{
+ return ALPHA_BLEND(d, IALPHA(s));
+}
+
+static inline uint32_t opIntMask(TVG_UNUSED uint32_t s, uint32_t d, uint8_t a)
+{
+ return ALPHA_BLEND(d, a);
+}
+
+static inline uint32_t opDifMask(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)
+{
+ return ALPHA_BLEND(s, IALPHA(d)) + ALPHA_BLEND(d, IALPHA(s));
+}
+
+static inline uint32_t opInterpolate(uint32_t s, uint32_t d, uint8_t a)
+{
+ return INTERPOLATE(s, d, a);
+}
+
+
int64_t mathMultiply(int64_t a, int64_t b);
int64_t mathDivide(int64_t a, int64_t b);
int64_t mathMulDiv(int64_t a, int64_t b, int64_t c);
@@ -344,8 +405,10 @@ void imageFree(SwImage* image);
bool fillGenColorTable(SwFill* fill, const Fill* fdata, const Matrix* transform, SwSurface* surface, uint32_t opacity, bool ctable);
void fillReset(SwFill* fill);
void fillFree(SwFill* fill);
-void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len);
-void fillFetchRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len);
+void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a); //blending ver.
+void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity); //masking ver.
+void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a); //blending ver.
+void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity); //masking ver.
SwRleData* rleRender(SwRleData* rle, const SwOutline* outline, const SwBBox& renderRegion, bool antiAlias);
SwRleData* rleRender(const SwBBox* bbox);
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwFill.cpp b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwFill.cpp
index 694bc3523..1a432ea86 100644
--- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwFill.cpp
+++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwFill.cpp
@@ -80,7 +80,7 @@ static bool _updateColorTable(SwFill* fill, const Fill* fdata, const SwSurface*
auto dist = static_cast(255 * t);
auto dist2 = 255 - dist;
- auto color = INTERPOLATE(dist2, rgba, rgba2);
+ auto color = INTERPOLATE(rgba, rgba2, dist2);
fill->ctable[i] = ALPHA_BLEND((color | 0xff000000), (color >> 24));
++i;
@@ -233,7 +233,7 @@ static inline uint32_t _pixel(const SwFill* fill, float pos)
/* External Class Implementation */
/************************************************************************/
-void fillFetchRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len)
+void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity)
{
auto rx = (x + 0.5f) * fill->radial.a11 + (y + 0.5f) * fill->radial.a12 + fill->radial.shiftX;
auto ry = (x + 0.5f) * fill->radial.a21 + (y + 0.5f) * fill->radial.a22 + fill->radial.shiftY;
@@ -244,16 +244,124 @@ void fillFetchRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x,
auto detFirstDerivative = 2.0f * (fill->radial.a11 * rx + fill->radial.a21 * ry) + 0.5f * detSecondDerivative;
auto det = rx * rx + ry * ry;
- for (uint32_t i = 0 ; i < len ; ++i) {
- *dst = _pixel(fill, sqrtf(det));
- ++dst;
- det += detFirstDerivative;
- detFirstDerivative += detSecondDerivative;
+ if (opacity == 255) {
+ for (uint32_t i = 0 ; i < len ; ++i, ++dst, cmp += csize) {
+ *dst = opAlphaBlend(_pixel(fill, sqrtf(det)), *dst, alpha(cmp));
+ det += detFirstDerivative;
+ detFirstDerivative += detSecondDerivative;
+ }
+ } else {
+ for (uint32_t i = 0 ; i < len ; ++i, ++dst, cmp += csize) {
+ *dst = opAlphaBlend(_pixel(fill, sqrtf(det)), *dst, MULTIPLY(opacity, alpha(cmp)));
+ det += detFirstDerivative;
+ detFirstDerivative += detSecondDerivative;
+ }
+ }
+}
+
+
+void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a)
+{
+ auto rx = (x + 0.5f) * fill->radial.a11 + (y + 0.5f) * fill->radial.a12 + fill->radial.shiftX;
+ auto ry = (x + 0.5f) * fill->radial.a21 + (y + 0.5f) * fill->radial.a22 + fill->radial.shiftY;
+
+ // detSecondDerivative = d(detFirstDerivative)/dx = d( d(det)/dx )/dx
+ auto detSecondDerivative = fill->radial.detSecDeriv;
+ // detFirstDerivative = d(det)/dx
+ auto detFirstDerivative = 2.0f * (fill->radial.a11 * rx + fill->radial.a21 * ry) + 0.5f * detSecondDerivative;
+ auto det = rx * rx + ry * ry;
+
+ if (op) {
+ for (uint32_t i = 0 ; i < len ; ++i, ++dst) {
+ *dst = op(_pixel(fill, sqrtf(det)), *dst, a);
+ det += detFirstDerivative;
+ detFirstDerivative += detSecondDerivative;
+ }
+ } else {
+ for (uint32_t i = 0 ; i < len ; ++i, ++dst) {
+ *dst = _pixel(fill, sqrtf(det));
+ det += detFirstDerivative;
+ detFirstDerivative += detSecondDerivative;
+ }
}
}
-void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len)
+void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity)
+{
+ //Rotation
+ float rx = x + 0.5f;
+ float ry = y + 0.5f;
+ float t = (fill->linear.dx * rx + fill->linear.dy * ry + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1);
+ float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1);
+
+ if (opacity == 255) {
+ if (mathZero(inc)) {
+ auto color = _fixedPixel(fill, static_cast(t * FIXPT_SIZE));
+ for (uint32_t i = 0; i < len; ++i, ++dst, cmp += csize) {
+ *dst = opAlphaBlend(color, *dst, alpha(cmp));
+ }
+ return;
+ }
+
+ auto vMax = static_cast(INT32_MAX >> (FIXPT_BITS + 1));
+ auto vMin = -vMax;
+ auto v = t + (inc * len);
+
+ //we can use fixed point math
+ if (v < vMax && v > vMin) {
+ auto t2 = static_cast(t * FIXPT_SIZE);
+ auto inc2 = static_cast(inc * FIXPT_SIZE);
+ for (uint32_t j = 0; j < len; ++j, ++dst, cmp += csize) {
+ *dst = opAlphaBlend(_fixedPixel(fill, t2), *dst, alpha(cmp));
+ t2 += inc2;
+ }
+ //we have to fallback to float math
+ } else {
+ uint32_t counter = 0;
+ while (counter++ < len) {
+ *dst = opAlphaBlend(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, alpha(cmp));
+ ++dst;
+ t += inc;
+ cmp += csize;
+ }
+ }
+ } else {
+ if (mathZero(inc)) {
+ auto color = _fixedPixel(fill, static_cast(t * FIXPT_SIZE));
+ for (uint32_t i = 0; i < len; ++i, ++dst, cmp += csize) {
+ *dst = opAlphaBlend(color, *dst, MULTIPLY(alpha(cmp), opacity));
+ }
+ return;
+ }
+
+ auto vMax = static_cast(INT32_MAX >> (FIXPT_BITS + 1));
+ auto vMin = -vMax;
+ auto v = t + (inc * len);
+
+ //we can use fixed point math
+ if (v < vMax && v > vMin) {
+ auto t2 = static_cast(t * FIXPT_SIZE);
+ auto inc2 = static_cast(inc * FIXPT_SIZE);
+ for (uint32_t j = 0; j < len; ++j, ++dst, cmp += csize) {
+ *dst = opAlphaBlend(_fixedPixel(fill, t2), *dst, MULTIPLY(alpha(cmp), opacity));
+ t2 += inc2;
+ }
+ //we have to fallback to float math
+ } else {
+ uint32_t counter = 0;
+ while (counter++ < len) {
+ *dst = opAlphaBlend(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, MULTIPLY(opacity, alpha(cmp)));
+ ++dst;
+ t += inc;
+ cmp += csize;
+ }
+ }
+ }
+}
+
+
+void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a)
{
//Rotation
float rx = x + 0.5f;
@@ -263,7 +371,13 @@ void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x,
if (mathZero(inc)) {
auto color = _fixedPixel(fill, static_cast(t * FIXPT_SIZE));
- rasterRGBA32(dst, color, 0, len);
+ if (op) {
+ for (uint32_t i = 0; i < len; ++i, ++dst) {
+ *dst = op(color, *dst, a);
+ }
+ } else {
+ rasterRGBA32(dst, color, 0, len);
+ }
return;
}
@@ -271,22 +385,41 @@ void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x,
auto vMin = -vMax;
auto v = t + (inc * len);
- //we can use fixed point math
- if (v < vMax && v > vMin) {
- auto t2 = static_cast(t * FIXPT_SIZE);
- auto inc2 = static_cast(inc * FIXPT_SIZE);
- for (uint32_t j = 0; j < len; ++j) {
- *dst = _fixedPixel(fill, t2);
- ++dst;
- t2 += inc2;
+ if (op) {
+ //we can use fixed point math
+ if (v < vMax && v > vMin) {
+ auto t2 = static_cast(t * FIXPT_SIZE);
+ auto inc2 = static_cast(inc * FIXPT_SIZE);
+ for (uint32_t j = 0; j < len; ++j, ++dst) {
+ *dst = op(_fixedPixel(fill, t2), *dst, a);
+ t2 += inc2;
+ }
+ //we have to fallback to float math
+ } else {
+ uint32_t counter = 0;
+ while (counter++ < len) {
+ *dst = op(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, a);
+ ++dst;
+ t += inc;
+ }
}
- //we have to fallback to float math
} else {
- uint32_t counter = 0;
- while (counter++ < len) {
- *dst = _pixel(fill, t / GRADIENT_STOP_SIZE);
- ++dst;
- t += inc;
+ //we can use fixed point math
+ if (v < vMax && v > vMin) {
+ auto t2 = static_cast(t * FIXPT_SIZE);
+ auto inc2 = static_cast(inc * FIXPT_SIZE);
+ for (uint32_t j = 0; j < len; ++j, ++dst) {
+ *dst = _fixedPixel(fill, t2);
+ t2 += inc2;
+ }
+ //we have to fallback to float math
+ } else {
+ uint32_t counter = 0;
+ while (counter++ < len) {
+ *dst = _pixel(fill, t / GRADIENT_STOP_SIZE);
+ ++dst;
+ t += inc;
+ }
}
}
}
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRaster.cpp b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRaster.cpp
index ef350101d..a94bd52f5 100644
--- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRaster.cpp
+++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRaster.cpp
@@ -37,30 +37,53 @@
/************************************************************************/
constexpr auto DOWN_SCALE_TOLERANCE = 0.5f;
-template
-static inline T _multiply(T c, T a)
+struct FillLinear
{
- return ((c * a + 0xff) >> 8);
-}
+ void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a)
+ {
+ fillLinear(fill, dst, y, x, len, op, a);
+ }
-static inline uint32_t _alpha(uint32_t c)
-{
- return (c >> 24);
-}
+ void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len)
+ {
+ fillLinear(fill, dst, y, x, len, nullptr, 255);
+ }
+
+ void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity)
+ {
+ fillLinear(fill, dst, y, x, len, cmp, alpha, csize, opacity);
+ }
+};
-static inline uint32_t _ialpha(uint32_t c)
+struct FillRadial
{
- return (~c >> 24);
-}
+ void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a)
+ {
+ fillRadial(fill, dst, y, x, len, op, a);
+ }
+
+ void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len)
+ {
+ fillRadial(fill, dst, y, x, len, nullptr, 255);
+ }
+
+ void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity)
+ {
+ fillRadial(fill, dst, y, x, len, cmp, alpha, csize, opacity);
+ }
+};
+
+static bool _rasterDirectRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity = 255);
-static inline uint8_t _alpha(uint8_t* a)
+
+static inline uint8_t ALPHA(uint8_t* a)
{
return *a;
}
-static inline uint8_t _ialpha(uint8_t* a)
+static inline uint8_t IALPHA(uint8_t* a)
{
return ~(*a);
}
@@ -104,12 +127,6 @@ static inline uint32_t _argbJoin(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
}
-#include "tvgSwRasterTexmap.h"
-#include "tvgSwRasterC.h"
-#include "tvgSwRasterAvx.h"
-#include "tvgSwRasterNeon.h"
-
-
static inline bool _compositing(const SwSurface* surface)
{
if (!surface->compositor || (int)surface->compositor->method <= (int)CompositeMethod::ClipPath) return false;
@@ -117,6 +134,81 @@ static inline bool _compositing(const SwSurface* surface)
}
+static inline bool _matting(const SwSurface* surface)
+{
+ if ((int)surface->compositor->method < (int)CompositeMethod::AddMask) return true;
+ else return false;
+}
+
+
+static inline bool _masking(const SwSurface* surface)
+{
+ if ((int)surface->compositor->method >= (int)CompositeMethod::AddMask) return true;
+ else return false;
+}
+
+
+struct AddMaskOp
+{
+ uint32_t operator()(uint32_t s, uint32_t d, uint8_t a)
+ {
+ return s + ALPHA_BLEND(d, a);
+ }
+};
+
+
+struct SubMaskOp
+{
+ uint32_t operator()(uint32_t s, uint32_t d, uint8_t a)
+ {
+ return ALPHA_BLEND(d, a);
+ }
+};
+
+
+struct DifMaskOp
+{
+ uint32_t operator()(uint32_t s, uint32_t d, uint8_t a)
+ {
+ return ALPHA_BLEND(s, IALPHA(d)) + ALPHA_BLEND(d, a);
+ }
+};
+
+
+struct AddMaskAOp
+{
+ uint32_t operator()(uint32_t s, uint32_t d, uint8_t a)
+ {
+ return INTERPOLATE(s, d, a);
+ }
+};
+
+
+struct SubMaskAOp
+{
+ uint32_t operator()(uint32_t s, uint32_t d, uint8_t a)
+ {
+ return ALPHA_BLEND(d, IALPHA(ALPHA_BLEND(s, a)));
+ }
+};
+
+
+struct DifMaskAOp
+{
+ uint32_t operator()(uint32_t s, uint32_t d, uint8_t a)
+ {
+ auto t = ALPHA_BLEND(s, a);
+ return ALPHA_BLEND(t, IALPHA(d)) + ALPHA_BLEND(d, IALPHA(t));
+ }
+};
+
+
+#include "tvgSwRasterTexmap.h"
+#include "tvgSwRasterC.h"
+#include "tvgSwRasterAvx.h"
+#include "tvgSwRasterNeon.h"
+
+
static inline uint32_t _halfScale(float scale)
{
auto halfScale = static_cast(0.5f / scale);
@@ -125,7 +217,7 @@ static inline uint32_t _halfScale(float scale)
}
//Bilinear Interpolation
-static uint32_t _interpUpScaler(const uint32_t *img, uint32_t w, uint32_t h, float sx, float sy)
+static uint32_t _interpUpScaler(const uint32_t *img, TVG_UNUSED uint32_t stride, uint32_t w, uint32_t h, float sx, float sy, TVG_UNUSED uint32_t n)
{
auto rx = (uint32_t)(sx);
auto ry = (uint32_t)(sy);
@@ -142,13 +234,15 @@ static uint32_t _interpUpScaler(const uint32_t *img, uint32_t w, uint32_t h, flo
auto c3 = img[rx2 + ry2 * w];
auto c4 = img[rx + ry2 * w];
- return INTERPOLATE(dy, INTERPOLATE(dx, c3, c4), INTERPOLATE(dx, c2, c1));
+ return INTERPOLATE(INTERPOLATE(c3, c4, dx), INTERPOLATE(c2, c1, dx), dy);
}
//2n x 2n Mean Kernel
-static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t w, uint32_t h, uint32_t rx, uint32_t ry, uint32_t n)
+static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t w, uint32_t h, float sx, float sy, uint32_t n)
{
+ uint32_t rx = sx;
+ uint32_t ry = sy;
uint32_t c[4] = {0, 0, 0, 0};
auto n2 = n * n;
auto src = img + rx - n + (ry - n) * stride;
@@ -174,21 +268,96 @@ static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t
void _rasterGrayscale8(uint8_t *dst, uint32_t val, uint32_t offset, int32_t len)
{
- cRasterPixels(dst, val, offset, len);
+ cRasterPixels(dst, val, offset, len);
}
/************************************************************************/
/* Rect */
/************************************************************************/
-static bool _rasterMaskedRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a, SwAlpha alpha)
+template
+static void _rasterMaskedRectDup(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+ auto w = static_cast(region.max.x - region.min.x);
+ auto h = static_cast(region.max.y - region.min.y);
+ auto cbuffer = surface->compositor->image.buf32 + (region.min.y * surface->compositor->image.stride + region.min.x); //compositor buffer
+ auto cstride = surface->compositor->image.stride;
+ auto color = surface->blender.join(r, g, b, a);
+ auto ialpha = 255 - a;
+
+ for (uint32_t y = 0; y < h; ++y) {
+ auto cmp = cbuffer;
+ for (uint32_t x = 0; x < w; ++x, ++cmp) {
+ *cmp = maskOp()(color, *cmp, ialpha);
+ }
+ cbuffer += cstride;
+ }
+}
+
+
+static void _rasterMaskedRectInt(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+ auto w = static_cast(region.max.x - region.min.x);
+ auto h = static_cast(region.max.y - region.min.y);
+ auto cstride = surface->compositor->image.stride;
+
+ for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) {
+ auto cmp = surface->compositor->image.buf32 + (y * cstride + surface->compositor->bbox.min.x);
+ if (y == region.min.y) {
+ for (uint32_t y2 = y; y2 < region.max.y; ++y2) {
+ auto tmp = cmp;
+ auto x = surface->compositor->bbox.min.x;
+ while (x < surface->compositor->bbox.max.x) {
+ if (x == region.min.x) {
+ for (uint32_t i = 0; i < w; ++i, ++tmp) {
+ *tmp = ALPHA_BLEND(*tmp, a);
+ }
+ x += w;
+ } else {
+ *tmp = 0;
+ ++tmp;
+ ++x;
+ }
+ }
+ cmp += cstride;
+ }
+ y += (h - 1);
+ } else {
+ rasterRGBA32(cmp, 0x00000000, 0, w);
+ cmp += cstride;
+ }
+ }
+}
+
+
+static bool _rasterMaskedRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+ //32bit channels composition
+ if (surface->channelSize != sizeof(uint32_t)) return false;
+
+ auto method = surface->compositor->method;
+
+    TVGLOG("SW_ENGINE", "Masked(%d) Rect [Region: %lu %lu %lu %lu]", (int)method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y);
+ if (method == CompositeMethod::AddMask) _rasterMaskedRectDup(surface, region, r, g, b, a);
+ else if (method == CompositeMethod::SubtractMask) _rasterMaskedRectDup(surface, region, r, g, b, a);
+ else if (method == CompositeMethod::DifferenceMask) _rasterMaskedRectDup(surface, region, r, g, b, a);
+ else if (method == CompositeMethod::IntersectMask) _rasterMaskedRectInt(surface, region, r, g, b, a);
+ else return false;
+
+ //Masking Composition
+ return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox);
+}
+
+
+static bool _rasterMattedRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
auto w = static_cast(region.max.x - region.min.x);
auto h = static_cast(region.max.y - region.min.y);
auto csize = surface->compositor->image.channelSize;
auto cbuffer = surface->compositor->image.buf8 + ((region.min.y * surface->compositor->image.stride + region.min.x) * csize); //compositor buffer
+ auto alpha = surface->blender.alpha(surface->compositor->method);
- TVGLOG("SW_ENGINE", "Masked Rect [Region: %lu %lu %u %u]", region.min.x, region.min.y, w, h);
+ TVGLOG("SW_ENGINE", "Matted(%d) Rect [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h);
//32bits channels
if (surface->channelSize == sizeof(uint32_t)) {
@@ -196,10 +365,9 @@ static bool _rasterMaskedRect(SwSurface* surface, const SwBBox& region, uint8_t
auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
for (uint32_t y = 0; y < h; ++y) {
auto dst = &buffer[y * surface->stride];
- auto cmp = &cbuffer[y * surface->stride * csize];
+ auto cmp = &cbuffer[y * surface->compositor->image.stride * csize];
for (uint32_t x = 0; x < w; ++x, ++dst, cmp += csize) {
- auto tmp = ALPHA_BLEND(color, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ *dst = INTERPOLATE(color, *dst, alpha(cmp));
}
}
//8bits grayscale
@@ -207,10 +375,9 @@ static bool _rasterMaskedRect(SwSurface* surface, const SwBBox& region, uint8_t
auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x;
for (uint32_t y = 0; y < h; ++y) {
auto dst = &buffer[y * surface->stride];
- auto cmp = &cbuffer[y * surface->stride * csize];
+ auto cmp = &cbuffer[y * surface->compositor->image.stride * csize];
for (uint32_t x = 0; x < w; ++x, ++dst, cmp += csize) {
- auto tmp = _multiply(a, alpha(cmp));
- *dst = tmp + _multiply(*dst, _ialpha(tmp));
+ *dst = INTERPOLATE8(a, *dst, alpha(cmp));
}
}
}
@@ -230,22 +397,25 @@ static bool _rasterSolidRect(SwSurface* surface, const SwBBox& region, uint8_t r
for (uint32_t y = 0; y < h; ++y) {
rasterRGBA32(buffer + y * surface->stride, color, region.min.x, w);
}
+ return true;
//8bits grayscale
- } else if (surface->channelSize == sizeof(uint8_t)) {
+ }
+ if (surface->channelSize == sizeof(uint8_t)) {
auto buffer = surface->buf8 + (region.min.y * surface->stride);
for (uint32_t y = 0; y < h; ++y) {
_rasterGrayscale8(buffer + y * surface->stride, 255, region.min.x, w);
}
+ return true;
}
- return true;
+ return false;
}
static bool _rasterRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- return _rasterMaskedRect(surface, region, r, g, b, a, alpha);
+ if (_matting(surface)) return _rasterMattedRect(surface, region, r, g, b, a);
+ else return _rasterMaskedRect(surface, region, r, g, b, a);
} else {
if (a == 255) {
return _rasterSolidRect(surface, region, r, g, b);
@@ -267,17 +437,89 @@ static bool _rasterRect(SwSurface* surface, const SwBBox& region, uint8_t r, uin
/* Rle */
/************************************************************************/
-static bool _rasterMaskedRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a, SwAlpha alpha)
+template
+static void _rasterMaskedRleDup(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
- TVGLOG("SW_ENGINE", "Masked Rle");
+ auto span = rle->spans;
+ auto cbuffer = surface->compositor->image.buf32;
+ auto cstride = surface->compositor->image.stride;
+ auto color = surface->blender.join(r, g, b, a);
+ uint32_t src;
+
+ for (uint32_t i = 0; i < rle->size; ++i, ++span) {
+ auto cmp = &cbuffer[span->y * cstride + span->x];
+ if (span->coverage == 255) src = color;
+ else src = ALPHA_BLEND(color, span->coverage);
+ auto ialpha = IALPHA(src);
+ for (auto x = 0; x < span->len; ++x, ++cmp) {
+ *cmp = maskOp()(src, *cmp, ialpha);
+ }
+ }
+}
+
+static void _rasterMaskedRleInt(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
auto span = rle->spans;
+ auto cbuffer = surface->compositor->image.buf32;
+ auto cstride = surface->compositor->image.stride;
+ auto color = surface->blender.join(r, g, b, a);
uint32_t src;
+
+ for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) {
+ auto cmp = &cbuffer[y * cstride];
+ uint32_t x = surface->compositor->bbox.min.x;
+ while (x < surface->compositor->bbox.max.x) {
+ if (y == span->y && x == span->x && x + span->len <= surface->compositor->bbox.max.x) {
+ if (span->coverage == 255) src = color;
+ else src = ALPHA_BLEND(color, span->coverage);
+ auto alpha = ALPHA(src);
+ for (uint32_t i = 0; i < span->len; ++i) {
+ cmp[x + i] = ALPHA_BLEND(cmp[x + i], alpha);
+ }
+ x += span->len;
+ ++span;
+ } else {
+ cmp[x] = 0;
+ ++x;
+ }
+ }
+ }
+}
+
+
+static bool _rasterMaskedRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+ TVGLOG("SW_ENGINE", "Masked(%d) Rle", (int)surface->compositor->method);
+
+ //32bit channels composition
+ if (surface->channelSize != sizeof(uint32_t)) return false;
+
+ auto method = surface->compositor->method;
+
+ if (method == CompositeMethod::AddMask) _rasterMaskedRleDup(surface, rle, r, g, b, a);
+ else if (method == CompositeMethod::SubtractMask) _rasterMaskedRleDup(surface, rle, r, g, b, a);
+ else if (method == CompositeMethod::DifferenceMask) _rasterMaskedRleDup(surface, rle, r, g, b, a);
+ else if (method == CompositeMethod::IntersectMask) _rasterMaskedRleInt(surface, rle, r, g, b, a);
+ else return false;
+
+ //Masking Composition
+ return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox);
+}
+
+
+static bool _rasterMattedRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+{
+ TVGLOG("SW_ENGINE", "Matted(%d) Rle", (int)surface->compositor->method);
+
+ auto span = rle->spans;
auto cbuffer = surface->compositor->image.buf8;
auto csize = surface->compositor->image.channelSize;
+ auto alpha = surface->blender.alpha(surface->compositor->method);
//32bit channels
if (surface->channelSize == sizeof(uint32_t)) {
+ uint32_t src;
auto color = surface->blender.join(r, g, b, a);
for (uint32_t i = 0; i < rle->size; ++i, ++span) {
auto dst = &surface->buf32[span->y * surface->stride + span->x];
@@ -285,24 +527,26 @@ static bool _rasterMaskedRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint
if (span->coverage == 255) src = color;
else src = ALPHA_BLEND(color, span->coverage);
for (uint32_t x = 0; x < span->len; ++x, ++dst, cmp += csize) {
- auto tmp = ALPHA_BLEND(src, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ *dst = INTERPOLATE(src, *dst, alpha(cmp));
}
}
+ return true;
+ }
//8bit grayscale
- } else if (surface->channelSize == sizeof(uint8_t)) {
+ if (surface->channelSize == sizeof(uint8_t)) {
+ uint8_t src;
for (uint32_t i = 0; i < rle->size; ++i, ++span) {
auto dst = &surface->buf8[span->y * surface->stride + span->x];
auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize];
if (span->coverage == 255) src = a;
- else src = _multiply(a, span->coverage);
+ else src = MULTIPLY(a, span->coverage);
for (uint32_t x = 0; x < span->len; ++x, ++dst, cmp += csize) {
- auto tmp = _multiply(src, alpha(cmp));
- *dst = tmp + _multiply(*dst, _ialpha(tmp));
+ *dst = INTERPOLATE8(src, *dst, alpha(cmp));
}
}
+ return true;
}
- return true;
+ return false;
}
@@ -328,14 +572,7 @@ static bool _rasterSolidRle(SwSurface* surface, const SwRleData* rle, uint8_t r,
//8bit grayscale
} else if (surface->channelSize == sizeof(uint8_t)) {
for (uint32_t i = 0; i < rle->size; ++i, ++span) {
- if (span->coverage == 255) {
- _rasterGrayscale8(surface->buf8 + span->y * surface->stride, 255, span->x, span->len);
- } else {
- auto dst = &surface->buf8[span->y * surface->stride + span->x];
- for (uint32_t x = 0; x < span->len; ++x, ++dst) {
- *dst = span->coverage;
- }
- }
+ _rasterGrayscale8(surface->buf8 + span->y * surface->stride, span->coverage, span->x, span->len);
}
}
return true;
@@ -347,8 +584,8 @@ static bool _rasterRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g,
if (!rle) return false;
if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- return _rasterMaskedRle(surface, rle, r, g, b, a, alpha);
+ if (_matting(surface)) return _rasterMattedRle(surface, rle, r, g, b, a);
+ else return _rasterMaskedRle(surface, rle, r, g, b, a);
} else {
if (a == 255) {
return _rasterSolidRle(surface, rle, r, g, b);
@@ -372,156 +609,142 @@ static bool _rasterRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g,
static bool _transformedRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* transform, uint32_t opacity)
{
- if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- return _rasterTexmapPolygon(surface, image, transform, nullptr, opacity, alpha);
- } else {
- return _rasterTexmapPolygon(surface, image, transform, nullptr, opacity, nullptr);
+ auto ret = _rasterTexmapPolygon(surface, image, transform, nullptr, opacity);
+
+ //Masking Composition
+ if (_compositing(surface) && _masking(surface)) {
+ return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox);
}
- return false;
+
+ return ret;
+
}
+
/************************************************************************/
/* RLE Scaled RGBA Image */
/************************************************************************/
-static bool _rasterScaledMaskedTranslucentRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale, SwAlpha alpha)
+template
+static void _rasterScaledMaskedRleRGBAImageDup(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
{
- TVGLOG("SW_ENGINE", "Scaled Masked Translucent Rle Image");
-
+ auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler;
auto span = image->rle->spans;
- auto csize = surface->compositor->image.channelSize;
- //Center (Down-Scaled)
- if (image->scale < DOWN_SCALE_TOLERANCE) {
- for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
- auto sy = (uint32_t)(span->y * itransform->e22 + itransform->e23);
- if (sy >= image->h) continue;
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- auto cmp = &surface->compositor->image.buf8[(span->y * surface->compositor->image.stride + span->x) * csize];
- auto a = _multiply(span->coverage, opacity);
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) {
- auto sx = (uint32_t)(x * itransform->e11 + itransform->e13);
- if (sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), a);
- auto tmp = ALPHA_BLEND(src, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
+ auto sy = span->y * itransform->e22 + itransform->e23;
+ if ((uint32_t)sy >= image->h) continue;
+ auto cmp = &surface->compositor->image.buf32[span->y * surface->compositor->image.stride + span->x];
+ auto a = MULTIPLY(span->coverage, opacity);
+ if (a == 255) {
+ for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++cmp) {
+ auto sx = x * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ *cmp = maskOp()(src, *cmp, 255);
}
- }
- //Center (Up-Scaled)
- } else {
- for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
- auto sy = span->y * itransform->e22 + itransform->e23;
- if ((uint32_t)sy >= image->h) continue;
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- auto cmp = &surface->compositor->image.buf8[(span->y * surface->compositor->image.stride + span->x) * csize];
- auto a = _multiply(span->coverage, opacity);
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) {
+ } else {
+ for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++cmp) {
auto sx = x * itransform->e11 + itransform->e13;
if ((uint32_t)sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), a);
- auto tmp = ALPHA_BLEND(src, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ *cmp = amaskOp()(src, *cmp, a);
}
}
}
- return true;
}
-static bool _rasterScaledMaskedRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t halfScale, SwAlpha alpha)
+static void _rasterScaledMaskedRleRGBAImageInt(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
{
- TVGLOG("SW_ENGINE", "Scaled Masked Rle Image");
-
+ auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler;
auto span = image->rle->spans;
- auto csize = surface->compositor->image.channelSize;
-
- //Center (Down-Scaled)
- if (image->scale < DOWN_SCALE_TOLERANCE) {
- for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
- auto sy = (uint32_t)(span->y * itransform->e22 + itransform->e23);
- if (sy >= image->h) continue;
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- auto cmp = &surface->compositor->image.buf8[(span->y * surface->compositor->image.stride + span->x) * csize];
- if (span->coverage == 255) {
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) {
- auto sx = (uint32_t)(x * itransform->e11 + itransform->e13);
- if (sx >= image->w) continue;
- auto tmp = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
- } else {
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) {
- auto sx = (uint32_t)(x * itransform->e11 + itransform->e13);
- if (sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), span->coverage);
- auto tmp = ALPHA_BLEND(src, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
- }
- }
- //Center (Up-Scaled)
- } else {
- for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
- auto sy = span->y * itransform->e22 + itransform->e23;
- if ((uint32_t)sy >= image->h) continue;
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- auto cmp = &surface->compositor->image.buf8[(span->y * surface->compositor->image.stride + span->x) * csize];
- if (span->coverage == 255) {
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) {
- auto sx = x * itransform->e11 + itransform->e13;
- if ((uint32_t)sx >= image->w) continue;
- auto tmp = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ auto cbuffer = surface->compositor->image.buf32;
+ auto cstride = surface->compositor->image.stride;
+
+ for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) {
+ auto cmp = &cbuffer[y * cstride];
+ for (uint32_t x = surface->compositor->bbox.min.x; x < surface->compositor->bbox.max.x; ++x) {
+ if (y == span->y && x == span->x && x + span->len <= surface->compositor->bbox.max.x) {
+ auto sy = span->y * itransform->e22 + itransform->e23;
+ if ((uint32_t)sy >= image->h) continue;
+ auto alpha = MULTIPLY(span->coverage, opacity);
+ if (alpha == 255) {
+ for (uint32_t i = 0; i < span->len; ++i) {
+ auto sx = (x + i) * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ cmp[x + i] = ALPHA_BLEND(cmp[x + i], ALPHA(src));
+ }
+ } else {
+ for (uint32_t i = 0; i < span->len; ++i) {
+ auto sx = (x + i) * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ cmp[x + i] = ALPHA_BLEND(cmp[x + i], ALPHA(ALPHA_BLEND(src, alpha)));
+ }
}
+ x += span->len - 1;
+ ++span;
} else {
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) {
- auto sx = x * itransform->e11 + itransform->e13;
- if ((uint32_t)sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), span->coverage);
- auto tmp = ALPHA_BLEND(src, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
+ cmp[x] = 0;
}
}
}
- return true;
}
-static bool _rasterScaledTranslucentRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
+static bool _rasterScaledMaskedRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
{
+ TVGLOG("SW_ENGINE", "Scaled Masked(%d) Rle Image", (int)surface->compositor->method);
+
+ auto method = surface->compositor->method;
+
+ if (method == CompositeMethod::AddMask) _rasterScaledMaskedRleRGBAImageDup(surface, image, itransform, region, opacity, halfScale);
+ else if (method == CompositeMethod::SubtractMask) _rasterScaledMaskedRleRGBAImageDup(surface, image, itransform, region, opacity, halfScale);
+    else if (method == CompositeMethod::DifferenceMask) _rasterScaledMaskedRleRGBAImageDup(surface, image, itransform, region, opacity, halfScale);
+ else if (method == CompositeMethod::IntersectMask) _rasterScaledMaskedRleRGBAImageInt(surface, image, itransform, region, opacity, halfScale);
+ else return false;
+
+ //Masking Composition
+ return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox);
+}
+
+
+static bool _rasterScaledMattedRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
+{
+ TVGLOG("SW_ENGINE", "Scaled Matted(%d) Rle Image", (int)surface->compositor->method);
+
auto span = image->rle->spans;
+ auto csize = surface->compositor->image.channelSize;
+ auto alpha = surface->blender.alpha(surface->compositor->method);
- //Center (Down-Scaled)
- if (image->scale < DOWN_SCALE_TOLERANCE) {
- for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
- auto sy = (uint32_t)(span->y * itransform->e22 + itransform->e23);
- if (sy >= image->h) continue;
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- auto alpha = _multiply(span->coverage, opacity);
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) {
- auto sx = (uint32_t)(x * itransform->e11 + itransform->e13);
- if (sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), alpha);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler;
+
+ for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
+ auto sy = span->y * itransform->e22 + itransform->e23;
+ if ((uint32_t)sy >= image->h) continue;
+ auto dst = &surface->buf32[span->y * surface->stride + span->x];
+ auto cmp = &surface->compositor->image.buf8[(span->y * surface->compositor->image.stride + span->x) * csize];
+ auto a = MULTIPLY(span->coverage, opacity);
+ if (a == 255) {
+ for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) {
+ auto sx = x * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto tmp = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), alpha(cmp));
+ *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp));
}
- }
- //Center (Up-Scaled)
- } else {
- for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
- auto sy = span->y * itransform->e22 + itransform->e23;
- if ((uint32_t)sy >= image->h) continue;
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- auto alpha = _multiply(span->coverage, opacity);
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) {
+ } else {
+ for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) {
auto sx = x * itransform->e11 + itransform->e13;
if ((uint32_t)sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), alpha);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ auto tmp = ALPHA_BLEND(src, MULTIPLY(alpha(cmp), a));
+ *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp));
}
}
}
+
return true;
}
@@ -530,48 +753,26 @@ static bool _rasterScaledRleRGBAImage(SwSurface* surface, const SwImage* image,
{
auto span = image->rle->spans;
- //Center (Down-Scaled)
- if (image->scale < DOWN_SCALE_TOLERANCE) {
- for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
- auto sy = (uint32_t)(span->y * itransform->e22 + itransform->e23);
- if (sy >= image->h) continue;
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- if (span->coverage == 255) {
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) {
- auto sx = (uint32_t)(x * itransform->e11 + itransform->e13);
- if (sx >= image->w) continue;
- auto src = _interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
- }
- } else {
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) {
- auto sx = (uint32_t)(x * itransform->e11 + itransform->e13);
- if (sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), span->coverage);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
- }
+ auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler;
+
+ for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
+ auto sy = span->y * itransform->e22 + itransform->e23;
+ if ((uint32_t)sy >= image->h) continue;
+ auto dst = &surface->buf32[span->y * surface->stride + span->x];
+ auto alpha = MULTIPLY(span->coverage, opacity);
+ if (alpha == 255) {
+ for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) {
+ auto sx = x * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ *dst = src + ALPHA_BLEND(*dst, IALPHA(src));
}
- }
- //Center (Up-Scaled)
- } else {
- for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
- auto sy = span->y * itransform->e22 + itransform->e23;
- if ((uint32_t)sy >= image->h) continue;
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- if (span->coverage == 255) {
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) {
- auto sx = x * itransform->e11 + itransform->e13;
- if ((uint32_t)sx >= image->w) continue;
- auto src = _interpUpScaler(image->buf32, image->w, image->h, sx, sy);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
- }
- } else {
- for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) {
- auto sx = x * itransform->e11 + itransform->e13;
- if ((uint32_t)sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), span->coverage);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
- }
+ } else {
+ for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) {
+ auto sx = x * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto src = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), alpha);
+ *dst = src + ALPHA_BLEND(*dst, IALPHA(src));
}
}
}
@@ -590,12 +791,10 @@ static bool _scaledRleRGBAImage(SwSurface* surface, const SwImage* image, const
auto halfScale = _halfScale(image->scale);
if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- if (opacity == 255) return _rasterScaledMaskedRleRGBAImage(surface, image, &itransform, region, halfScale, alpha);
- else return _rasterScaledMaskedTranslucentRleRGBAImage(surface, image, &itransform, region, opacity, halfScale, alpha);
+ if (_matting(surface)) _rasterScaledMattedRleRGBAImage(surface, image, &itransform, region, opacity, halfScale);
+ else _rasterScaledMaskedRleRGBAImage(surface, image, &itransform, region, opacity, halfScale);
} else {
- if (opacity == 255) return _rasterScaledRleRGBAImage(surface, image, &itransform, region, opacity, halfScale);
- else return _rasterScaledTranslucentRleRGBAImage(surface, image, &itransform, region, opacity, halfScale);
+ return _rasterScaledRleRGBAImage(surface, image, &itransform, region, opacity, halfScale);
}
return false;
}
@@ -605,56 +804,104 @@ static bool _scaledRleRGBAImage(SwSurface* surface, const SwImage* image, const
/* RLE Direct RGBA Image */
/************************************************************************/
-static bool _rasterDirectMaskedTranslucentRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity, SwAlpha alpha)
+template
+static void _rasterDirectMaskedRleRGBAImageDup(SwSurface* surface, const SwImage* image, uint32_t opacity)
{
- TVGLOG("SW_ENGINE", "Direct Masked Rle Image");
-
auto span = image->rle->spans;
- auto csize = surface->compositor->image.channelSize;
- auto cbuffer = surface->compositor->image.buf8;
+ auto cbuffer = surface->compositor->image.buf32;
+ auto ctride = surface->compositor->image.stride;
for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize];
- auto img = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox);
- auto a = _multiply(span->coverage, opacity);
- if (a == 255) {
- for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img, cmp += csize) {
- auto tmp = ALPHA_BLEND(*img, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ auto src = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox);
+ auto cmp = &cbuffer[span->y * ctride + span->x];
+ auto alpha = MULTIPLY(span->coverage, opacity);
+ if (alpha == 255) {
+ for (uint32_t x = 0; x < span->len; ++x, ++src, ++cmp) {
+ *cmp = maskOp()(*src, *cmp, IALPHA(*src));
}
} else {
- for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img, cmp += csize) {
- auto tmp = ALPHA_BLEND(*img, _multiply(a, alpha(cmp)));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ for (uint32_t x = 0; x < span->len; ++x, ++src, ++cmp) {
+ *cmp = amaskOp()(*src, *cmp, alpha);
}
}
}
- return true;
}
-static bool _rasterDirectMaskedRleRGBAImage(SwSurface* surface, const SwImage* image, SwAlpha alpha)
+static void _rasterDirectMaskedRleRGBAImageInt(SwSurface* surface, const SwImage* image, uint32_t opacity)
+{
+ auto span = image->rle->spans;
+ auto cbuffer = surface->compositor->image.buf32;
+ auto ctride = surface->compositor->image.stride;
+
+ for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) {
+ auto cmp = &cbuffer[y * ctride];
+ auto x = surface->compositor->bbox.min.x;
+ while (x < surface->compositor->bbox.max.x) {
+ if (y == span->y && x == span->x && x + span->len <= surface->compositor->bbox.max.x) {
+ auto alpha = MULTIPLY(span->coverage, opacity);
+ auto src = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox);
+ if (alpha == 255) {
+ for (uint32_t i = 0; i < span->len; ++i, ++src) {
+ cmp[x + i] = ALPHA_BLEND(cmp[x + i], ALPHA(*src));
+ }
+ } else {
+ for (uint32_t i = 0; i < span->len; ++i, ++src) {
+ auto t = ALPHA_BLEND(*src, alpha);
+ cmp[x + i] = ALPHA_BLEND(cmp[x + i], ALPHA(t));
+ }
+ }
+ x += span->len;
+ ++span;
+ } else {
+ cmp[x] = 0;
+ ++x;
+ }
+ }
+ }
+}
+
+
+static bool _rasterDirectMaskedRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity)
+{
+ TVGLOG("SW_ENGINE", "Direct Masked(%d) Rle Image", (int)surface->compositor->method);
+
+ auto method = surface->compositor->method;
+
+ if (method == CompositeMethod::AddMask) _rasterDirectMaskedRleRGBAImageDup(surface, image, opacity);
+ else if (method == CompositeMethod::SubtractMask) _rasterDirectMaskedRleRGBAImageDup(surface, image, opacity);
+ else if (method == CompositeMethod::DifferenceMask) _rasterDirectMaskedRleRGBAImageDup(surface, image, opacity);
+ else if (method == CompositeMethod::IntersectMask) _rasterDirectMaskedRleRGBAImageInt(surface, image, opacity);
+ else return false;
+
+ //Masking Composition
+ return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox);
+}
+
+
+static bool _rasterDirectMattedRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity)
{
- TVGLOG("SW_ENGINE", "Direct Masked Rle Image");
+ TVGLOG("SW_ENGINE", "Direct Matted(%d) Rle Image", (int)surface->compositor->method);
auto span = image->rle->spans;
auto csize = surface->compositor->image.channelSize;
auto cbuffer = surface->compositor->image.buf8;
+ auto alpha = surface->blender.alpha(surface->compositor->method);
for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
auto dst = &surface->buf32[span->y * surface->stride + span->x];
auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize];
auto img = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox);
- if (span->coverage == 255) {
+ auto a = MULTIPLY(span->coverage, opacity);
+ if (a == 255) {
for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img, cmp += csize) {
auto tmp = ALPHA_BLEND(*img, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp));
}
} else {
for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img, cmp += csize) {
- auto tmp = ALPHA_BLEND(*img, _multiply(span->coverage, alpha(cmp)));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ auto tmp = ALPHA_BLEND(*img, MULTIPLY(a, alpha(cmp)));
+ *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp));
}
}
}
@@ -662,57 +909,37 @@ static bool _rasterDirectMaskedRleRGBAImage(SwSurface* surface, const SwImage* i
}
-static bool _rasterDirectTranslucentRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity)
+static bool _rasterDirectRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity)
{
auto span = image->rle->spans;
for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
auto dst = &surface->buf32[span->y * surface->stride + span->x];
auto img = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox);
- auto alpha = _multiply(span->coverage, opacity);
- for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) {
- auto src = ALPHA_BLEND(*img, alpha);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ auto alpha = MULTIPLY(span->coverage, opacity);
+ if (alpha == 255) {
+ *dst = *img + ALPHA_BLEND(*dst, IALPHA(*img));
+ } else {
+ for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) {
+ auto src = ALPHA_BLEND(*img, alpha);
+ *dst = src + ALPHA_BLEND(*dst, IALPHA(src));
+ }
}
}
return true;
}
-static bool _rasterDirectRleRGBAImage(SwSurface* surface, const SwImage* image)
+static bool _directRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity)
{
- auto span = image->rle->spans;
-
- for (uint32_t i = 0; i < image->rle->size; ++i, ++span) {
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- auto img = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox);
- if (span->coverage == 255) {
- for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) {
- *dst = *img + ALPHA_BLEND(*dst, _ialpha(*img));
- }
- } else {
- for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) {
- auto src = ALPHA_BLEND(*img, span->coverage);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
- }
- }
- }
- return true;
-}
-
-
-static bool _directRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity)
-{
- if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- if (opacity == 255) return _rasterDirectMaskedRleRGBAImage(surface, image, alpha);
- else return _rasterDirectMaskedTranslucentRleRGBAImage(surface, image, opacity, alpha);
- } else {
- if (opacity == 255) return _rasterDirectRleRGBAImage(surface, image);
- else return _rasterDirectTranslucentRleRGBAImage(surface, image, opacity);
- }
- return false;
-}
+ if (_compositing(surface)) {
+ if (_matting(surface)) return _rasterDirectMattedRleRGBAImage(surface, image, opacity);
+ else return _rasterDirectMaskedRleRGBAImage(surface, image, opacity);
+ } else {
+ return _rasterDirectRleRGBAImage(surface, image, opacity);
+ }
+ return false;
+}
/************************************************************************/
@@ -721,24 +948,21 @@ static bool _directRleRGBAImage(SwSurface* surface, const SwImage* image, uint32
static bool _transformedRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox& region, uint32_t opacity)
{
- if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- return _rasterTexmapPolygon(surface, image, transform, ®ion, opacity, alpha);
- } else {
- return _rasterTexmapPolygon(surface, image, transform, ®ion, opacity, nullptr);
+ auto ret = _rasterTexmapPolygon(surface, image, transform, ®ion, opacity);
+
+ //Masking Composition
+ if (_compositing(surface) && _masking(surface)) {
+ return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox);
}
- return false;
+
+ return ret;
}
+
static bool _transformedRGBAImageMesh(SwSurface* surface, const SwImage* image, const RenderMesh* mesh, const Matrix* transform, const SwBBox* region, uint32_t opacity)
{
- if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- return _rasterTexmapPolygonMesh(surface, image, mesh, transform, region, opacity, alpha);
- } else {
- return _rasterTexmapPolygonMesh(surface, image, mesh, transform, region, opacity, nullptr);
- }
- return false;
+ //TODO: Not completed for all cases.
+ return _rasterTexmapPolygonMesh(surface, image, mesh, transform, region, opacity);
}
@@ -746,161 +970,169 @@ static bool _transformedRGBAImageMesh(SwSurface* surface, const SwImage* image,
/*Scaled RGBA Image */
/************************************************************************/
-static bool _rasterScaledMaskedTranslucentRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale, SwAlpha alpha)
+template
+static void _rasterScaledMaskedRGBAImageDup(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
{
- TVGLOG("SW_ENGINE", "Scaled Masked Image");
+ auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler;
+ auto cstride = surface->compositor->image.stride;
+ auto cbuffer = surface->compositor->image.buf32 + (region.min.y * cstride + region.min.x);
- auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x);
- auto csize = surface->compositor->image.channelSize;
- auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize;
-
- // Down-Scaled
- if (image->scale < DOWN_SCALE_TOLERANCE) {
- for (auto y = region.min.y; y < region.max.y; ++y) {
- auto sy = (uint32_t)(y * itransform->e22 + itransform->e23);
- if (sy >= image->h) continue;
- auto dst = dbuffer;
- auto cmp = cbuffer;
- for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) {
- auto sx = (uint32_t)(x * itransform->e11 + itransform->e13);
- if (sx >= image->w) continue;
- auto a = _multiply(opacity, alpha(cmp));
- auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), a);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ for (auto y = region.min.y; y < region.max.y; ++y) {
+ auto sy = y * itransform->e22 + itransform->e23;
+ if ((uint32_t)sy >= image->h) continue;
+ auto cmp = cbuffer;
+ if (opacity == 255) {
+ for (auto x = region.min.x; x < region.max.x; ++x, ++cmp) {
+ auto sx = x * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ *cmp = maskOp()(src, *cmp, IALPHA(src));
}
- dbuffer += surface->stride;
- cbuffer += surface->compositor->image.stride * csize;
- }
- // Up-Scaled
- } else {
- for (auto y = region.min.y; y < region.max.y; ++y) {
- auto sy = y * itransform->e22 + itransform->e23;
- if ((uint32_t)sy >= image->h) continue;
- auto dst = dbuffer;
- auto cmp = cbuffer;
- for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) {
+ } else {
+ for (auto x = region.min.x; x < region.max.x; ++x, ++cmp) {
auto sx = x * itransform->e11 + itransform->e13;
if ((uint32_t)sx >= image->w) continue;
- auto a = _multiply(opacity, alpha(cmp));
- auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), a);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ *cmp = amaskOp()(src, *cmp, opacity);
}
- dbuffer += surface->stride;
- cbuffer += surface->compositor->image.stride * csize;
}
+ cbuffer += cstride;
}
- return true;
}
-
-static bool _rasterScaledMaskedRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t halfScale, SwAlpha alpha)
+static void _rasterScaledMaskedRGBAImageInt(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
{
- TVGLOG("SW_ENGINE", "Scaled Masked Image");
-
- auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x);
- auto csize = surface->compositor->image.channelSize;
- auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize;
-
- // Down-Scaled
- if (image->scale < DOWN_SCALE_TOLERANCE) {
- for (auto y = region.min.y; y < region.max.y; ++y) {
- auto sy = (uint32_t)(y * itransform->e22 + itransform->e23);
- if (sy >= image->h) continue;
- auto dst = dbuffer;
- auto cmp = cbuffer;
- for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) {
- auto sx = (uint32_t)(x * itransform->e11 + itransform->e13);
- if (sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), alpha(cmp));
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler;
+ auto h = static_cast(region.max.y - region.min.y);
+ auto w = static_cast(region.max.x - region.min.x);
+ auto cstride = surface->compositor->image.stride;
+ auto cbuffer = surface->compositor->image.buf32 + (surface->compositor->bbox.min.y * cstride + surface->compositor->bbox.min.x);
+
+ for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) {
+ if (y == region.min.y) {
+ auto cbuffer2 = cbuffer;
+ for (uint32_t y2 = y; y2 < region.max.y; ++y2) {
+ auto sy = y2 * itransform->e22 + itransform->e23;
+ if ((uint32_t)sy >= image->h) continue;
+ auto tmp = cbuffer2;
+ auto x = surface->compositor->bbox.min.x;
+ while (x < surface->compositor->bbox.max.x) {
+ if (x == region.min.x) {
+ if (opacity == 255) {
+ for (uint32_t i = 0; i < w; ++i, ++tmp) {
+ auto sx = (x + i) * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ *tmp = ALPHA_BLEND(*tmp, ALPHA(src));
+ }
+ } else {
+ for (uint32_t i = 0; i < w; ++i, ++tmp) {
+ auto sx = (x + i) * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto src = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), opacity);
+ *tmp = ALPHA_BLEND(*tmp, ALPHA(src));
+ }
+ }
+ x += w;
+ } else {
+ *tmp = 0;
+ ++tmp;
+ ++x;
+ }
+ }
+ cbuffer2 += cstride;
}
- dbuffer += surface->stride;
- cbuffer += surface->compositor->image.stride * csize;
- }
- // Up-Scaled
- } else {
- for (auto y = region.min.y; y < region.max.y; ++y) {
- auto sy = y * itransform->e22 + itransform->e23;
- if ((uint32_t)sy >= image->h) continue;
- auto dst = dbuffer;
- auto cmp = cbuffer;
- for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) {
- auto sx = x * itransform->e11 + itransform->e13;
- if ((uint32_t)sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), alpha(cmp));
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ y += (h - 1);
+ } else {
+ auto tmp = cbuffer;
+ for (uint32_t x = surface->compositor->bbox.min.x; x < surface->compositor->bbox.max.x; ++x, ++tmp) {
+ *tmp = 0;
}
- dbuffer += surface->stride;
- cbuffer += surface->compositor->image.stride * csize;
}
+ cbuffer += cstride;
}
- return true;
}
-static bool _rasterScaledTranslucentRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
+static bool _rasterScaledMaskedRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
+{
+ auto method = surface->compositor->method;
+
+ TVGLOG("SW_ENGINE", "Scaled Masked(%d) Image [Region: %lu %lu %lu %lu]", (int)surface->compositor->method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y);
+
+ if (method == CompositeMethod::AddMask) _rasterScaledMaskedRGBAImageDup(surface, image, itransform, region, opacity, halfScale);
+ else if (method == CompositeMethod::SubtractMask) _rasterScaledMaskedRGBAImageDup(surface, image, itransform, region, opacity, halfScale);
+ else if (method == CompositeMethod::DifferenceMask) _rasterScaledMaskedRGBAImageDup(surface, image, itransform, region, opacity, halfScale);
+ else if (method == CompositeMethod::IntersectMask) _rasterScaledMaskedRGBAImageInt(surface, image, itransform, region, opacity, halfScale);
+ else return false;
+
+ //Masking Composition
+ return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox);
+}
+
+
+static bool _rasterScaledMattedRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
{
auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x);
+ auto csize = surface->compositor->image.channelSize;
+ auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize;
+ auto alpha = surface->blender.alpha(surface->compositor->method);
- // Down-Scaled
- if (image->scale < DOWN_SCALE_TOLERANCE) {
- for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) {
- auto sy = (uint32_t)(y * itransform->e22 + itransform->e23);
- if (sy >= image->h) continue;
- auto dst = dbuffer;
- for (auto x = region.min.x; x < region.max.x; ++x, ++dst) {
- auto sx = (uint32_t)(x * itransform->e11 + itransform->e13);
- if (sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), opacity);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ TVGLOG("SW_ENGINE", "Scaled Matted(%d) Image [Region: %lu %lu %lu %lu]", (int)surface->compositor->method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y);
+
+ auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler;
+
+ for (auto y = region.min.y; y < region.max.y; ++y) {
+ auto sy = y * itransform->e22 + itransform->e23;
+ if ((uint32_t)sy >= image->h) continue;
+ auto dst = dbuffer;
+ auto cmp = cbuffer;
+ if (opacity == 255) {
+ for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) {
+ auto sx = x * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ auto temp = ALPHA_BLEND(src, alpha(cmp));
+ *dst = temp + ALPHA_BLEND(*dst, IALPHA(temp));
}
- }
- // Up-Scaled
- } else {
- for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) {
- auto sy = fabsf(y * itransform->e22 + itransform->e23);
- if (sy >= image->h) continue;
- auto dst = dbuffer;
- for (auto x = region.min.x; x < region.max.x; ++x, ++dst) {
+ } else {
+ for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) {
auto sx = x * itransform->e11 + itransform->e13;
if ((uint32_t)sx >= image->w) continue;
- auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), opacity);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ auto temp = ALPHA_BLEND(src, MULTIPLY(opacity, alpha(cmp)));
+ *dst = temp + ALPHA_BLEND(*dst, IALPHA(temp));
}
}
+ dbuffer += surface->stride;
+ cbuffer += surface->compositor->image.stride * csize;
}
return true;
}
-static bool _rasterScaledRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t halfScale)
+static bool _rasterScaledRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale)
{
auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x);
+ auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler;
- // Down-Scaled
- if (image->scale < DOWN_SCALE_TOLERANCE) {
- for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) {
- auto sy = (uint32_t)(y * itransform->e22 + itransform->e23);
- if (sy >= image->h) continue;
- auto dst = dbuffer;
+ for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) {
+ auto sy = y * itransform->e22 + itransform->e23;
+ if ((uint32_t)sy >= image->h) continue;
+ auto dst = dbuffer;
+ if (opacity == 255) {
for (auto x = region.min.x; x < region.max.x; ++x, ++dst) {
- auto sx = (uint32_t)(x * itransform->e11 + itransform->e13);
- if (sx >= image->w) continue;
- auto src = _interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ auto sx = x * itransform->e11 + itransform->e13;
+ if ((uint32_t)sx >= image->w) continue;
+ auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale);
+ *dst = src + ALPHA_BLEND(*dst, IALPHA(src));
}
- }
- // Up-Scaled
- } else {
- for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) {
- auto sy = y * itransform->e22 + itransform->e23;
- if ((uint32_t)sy >= image->h) continue;
- auto dst = dbuffer;
+ } else {
for (auto x = region.min.x; x < region.max.x; ++x, ++dst) {
auto sx = x * itransform->e11 + itransform->e13;
if ((uint32_t)sx >= image->w) continue;
- auto src = _interpUpScaler(image->buf32, image->w, image->h, sx, sy);
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ auto src = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), opacity);
+ *dst = src + ALPHA_BLEND(*dst, IALPHA(src));
}
}
}
@@ -919,12 +1151,10 @@ static bool _scaledRGBAImage(SwSurface* surface, const SwImage* image, const Mat
auto halfScale = _halfScale(image->scale);
if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- if (opacity == 255) return _rasterScaledMaskedRGBAImage(surface, image, &itransform, region, halfScale, alpha);
- else return _rasterScaledMaskedTranslucentRGBAImage(surface, image, &itransform, region, opacity, halfScale, alpha);
+ if (_matting(surface)) return _rasterScaledMattedRGBAImage(surface, image, &itransform, region, opacity, halfScale);
+ else return _rasterScaledMaskedRGBAImage(surface, image, &itransform, region, opacity, halfScale);
} else {
- if (opacity == 255) return _rasterScaledRGBAImage(surface, image, &itransform, region, halfScale);
- else return _rasterScaledTranslucentRGBAImage(surface, image, &itransform, region, opacity, halfScale);
+ return _rasterScaledRGBAImage(surface, image, &itransform, region, opacity, halfScale);
}
return false;
}
@@ -934,53 +1164,122 @@ static bool _scaledRGBAImage(SwSurface* surface, const SwImage* image, const Mat
/* Direct RGBA Image */
/************************************************************************/
-static bool _rasterDirectMaskedRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, SwAlpha alpha)
+template
+static void _rasterDirectMaskedRGBAImageDup(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity)
{
- TVGLOG("SW_ENGINE", "Direct Masked Image");
-
- auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
- auto h2 = static_cast(region.max.y - region.min.y);
- auto w2 = static_cast(region.max.x - region.min.x);
- auto csize = surface->compositor->image.channelSize;
+ auto h = static_cast(region.max.y - region.min.y);
+ auto w = static_cast(region.max.x - region.min.x);
+ auto cstride = surface->compositor->image.stride;
+ auto cbuffer = surface->compositor->image.buf32 + (region.min.y * cstride + region.min.x); //compositor buffer
auto sbuffer = image->buf32 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox);
- auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; //compositor buffer
- for (uint32_t y = 0; y < h2; ++y) {
- auto dst = buffer;
+ for (uint32_t y = 0; y < h; ++y) {
auto cmp = cbuffer;
auto src = sbuffer;
- for (uint32_t x = 0; x < w2; ++x, ++dst, ++src, cmp += csize) {
- auto tmp = ALPHA_BLEND(*src, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ if (opacity == 255) {
+ for (uint32_t x = 0; x < w; ++x, ++src, ++cmp) {
+ *cmp = maskOp()(*src, *cmp, IALPHA(*src));
+ }
+ } else {
+ for (uint32_t x = 0; x < w; ++x, ++src, ++cmp) {
+ *cmp = amaskOp()(*src, *cmp, opacity);
+ }
}
- buffer += surface->stride;
- cbuffer += surface->compositor->image.stride * csize;
+ cbuffer += cstride;
sbuffer += image->stride;
}
- return true;
}
-static bool _rasterDirectMaskedTranslucentRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity, SwAlpha alpha)
+static void _rasterDirectMaskedRGBAImageInt(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity)
{
- TVGLOG("SW_ENGINE", "Direct Masked Translucent Image");
+ auto h = static_cast(region.max.y - region.min.y);
+ auto w = static_cast(region.max.x - region.min.x);
+ auto cstride = surface->compositor->image.stride;
+ auto cbuffer = surface->compositor->image.buf32 + (surface->compositor->bbox.min.y * cstride + surface->compositor->bbox.min.x);
+
+ for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) {
+ if (y == region.min.y) {
+ auto cbuffer2 = cbuffer;
+ for (uint32_t y2 = y; y2 < region.max.y; ++y2) {
+ auto tmp = cbuffer2;
+ auto x = surface->compositor->bbox.min.x;
+ while (x < surface->compositor->bbox.max.x) {
+ if (x == region.min.x) {
+ auto src = &image->buf32[(y2 + image->oy) * image->stride + (x + image->ox)];
+ if (opacity == 255) {
+ for (uint32_t i = 0; i < w; ++i, ++tmp, ++src) {
+ *tmp = ALPHA_BLEND(*tmp, ALPHA(*src));
+ }
+ } else {
+ for (uint32_t i = 0; i < w; ++i, ++tmp, ++src) {
+ auto t = ALPHA_BLEND(*src, opacity);
+ *tmp = ALPHA_BLEND(*tmp, ALPHA(t));
+ }
+ }
+ x += w;
+ } else {
+ *tmp = 0;
+ ++tmp;
+ ++x;
+ }
+ }
+ cbuffer2 += cstride;
+ }
+ y += (h - 1);
+ } else {
+ rasterRGBA32(cbuffer, 0x00000000, 0, surface->compositor->bbox.max.x - surface->compositor->bbox.min.x);
+ }
+ cbuffer += cstride;
+ }
+}
+
+static bool _rasterDirectMaskedRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity)
+{
+ auto method = surface->compositor->method;
+
+ TVGLOG("SW_ENGINE", "Direct Masked(%d) Image [Region: %lu %lu %lu %lu]", (int)surface->compositor->method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y);
+
+ if (method == CompositeMethod::AddMask) _rasterDirectMaskedRGBAImageDup(surface, image, region, opacity);
+ else if (method == CompositeMethod::SubtractMask) _rasterDirectMaskedRGBAImageDup(surface, image, region, opacity);
+ else if (method == CompositeMethod::DifferenceMask) _rasterDirectMaskedRGBAImageDup(surface, image, region, opacity);
+ else if (method == CompositeMethod::IntersectMask) _rasterDirectMaskedRGBAImageInt(surface, image, region, opacity);
+ else return false;
+
+ //Masking Composition
+ return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox);
+}
+
+
+static bool _rasterDirectMattedRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity)
+{
auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
- auto h2 = static_cast(region.max.y - region.min.y);
- auto w2 = static_cast(region.max.x - region.min.x);
+ auto h = static_cast(region.max.y - region.min.y);
+ auto w = static_cast(region.max.x - region.min.x);
auto csize = surface->compositor->image.channelSize;
+ auto alpha = surface->blender.alpha(surface->compositor->method);
+
+ TVGLOG("SW_ENGINE", "Direct Matted(%d) Image [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h);
auto sbuffer = image->buf32 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox);
auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; //compositor buffer
- for (uint32_t y = 0; y < h2; ++y) {
+ for (uint32_t y = 0; y < h; ++y) {
auto dst = buffer;
auto cmp = cbuffer;
auto src = sbuffer;
- for (uint32_t x = 0; x < w2; ++x, ++dst, ++src, cmp += csize) {
- auto tmp = ALPHA_BLEND(*src, _multiply(opacity, alpha(cmp)));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ if (opacity == 255) {
+ for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) {
+ auto tmp = ALPHA_BLEND(*src, alpha(cmp));
+ *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp));
+ }
+ } else {
+ for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) {
+ auto tmp = ALPHA_BLEND(*src, MULTIPLY(opacity, alpha(cmp)));
+ *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp));
+ }
}
buffer += surface->stride;
cbuffer += surface->compositor->image.stride * csize;
@@ -990,7 +1289,7 @@ static bool _rasterDirectMaskedTranslucentRGBAImage(SwSurface* surface, const Sw
}
-static bool _rasterDirectTranslucentRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity)
+static bool _rasterDirectRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity)
{
auto dbuffer = &surface->buf32[region.min.y * surface->stride + region.min.x];
auto sbuffer = image->buf32 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox);
@@ -998,27 +1297,15 @@ static bool _rasterDirectTranslucentRGBAImage(SwSurface* surface, const SwImage*
for (auto y = region.min.y; y < region.max.y; ++y) {
auto dst = dbuffer;
auto src = sbuffer;
- for (auto x = region.min.x; x < region.max.x; ++x, ++dst, ++src) {
- auto tmp = ALPHA_BLEND(*src, opacity);
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
- dbuffer += surface->stride;
- sbuffer += image->stride;
- }
- return true;
-}
-
-
-static bool _rasterDirectRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region)
-{
- auto dbuffer = &surface->buf32[region.min.y * surface->stride + region.min.x];
- auto sbuffer = image->buf32 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox);
-
- for (auto y = region.min.y; y < region.max.y; ++y) {
- auto dst = dbuffer;
- auto src = sbuffer;
- for (auto x = region.min.x; x < region.max.x; x++, dst++, src++) {
- *dst = *src + ALPHA_BLEND(*dst, _ialpha(*src));
+ if (opacity == 255) {
+ for (auto x = region.min.x; x < region.max.x; x++, dst++, src++) {
+ *dst = *src + ALPHA_BLEND(*dst, IALPHA(*src));
+ }
+ } else {
+ for (auto x = region.min.x; x < region.max.x; ++x, ++dst, ++src) {
+ auto tmp = ALPHA_BLEND(*src, opacity);
+ *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp));
+ }
}
dbuffer += surface->stride;
sbuffer += image->stride;
@@ -1031,12 +1318,10 @@ static bool _rasterDirectRGBAImage(SwSurface* surface, const SwImage* image, con
static bool _directRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity)
{
if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- if (opacity == 255) return _rasterDirectMaskedRGBAImage(surface, image, region, alpha);
- else return _rasterDirectMaskedTranslucentRGBAImage(surface, image, region, opacity, alpha);
+ if (_matting(surface)) return _rasterDirectMattedRGBAImage(surface, image, region, opacity);
+ else return _rasterDirectMaskedRGBAImage(surface, image, region, opacity);
} else {
- if (opacity == 255) return _rasterDirectRGBAImage(surface, image, region);
- else return _rasterDirectTranslucentRGBAImage(surface, image, region, opacity);
+ return _rasterDirectRGBAImage(surface, image, region, opacity);
}
return false;
}
@@ -1060,31 +1345,82 @@ static bool _rasterRGBAImage(SwSurface* surface, SwImage* image, const Matrix* t
/************************************************************************/
-/* Rect Linear Gradient */
+/* Rect Gradient */
/************************************************************************/
-static bool _rasterLinearGradientMaskedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill, SwAlpha alpha)
+template
+static bool _rasterGradientMaskedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
{
- if (fill->linear.len < FLT_EPSILON) return false;
+ auto h = static_cast(region.max.y - region.min.y);
+ auto w = static_cast(region.max.x - region.min.x);
+ auto cstride = surface->compositor->image.stride;
+ auto cbuffer = surface->compositor->image.buf32 + (region.min.y * cstride + region.min.x);
+ auto method = surface->compositor->method;
+
+ TVGLOG("SW_ENGINE", "Masked(%d) Gradient [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h);
+
+ if (method == CompositeMethod::AddMask) {
+ for (uint32_t y = 0; y < h; ++y) {
+ fillMethod()(fill, cbuffer, region.min.y + y, region.min.x, w, opAddMask, 255);
+ cbuffer += surface->stride;
+ }
+ } else if (method == CompositeMethod::SubtractMask) {
+ for (uint32_t y = 0; y < h; ++y) {
+ fillMethod()(fill, cbuffer, region.min.y + y, region.min.x, w, opSubMask, 255);
+ cbuffer += surface->stride;
+ }
+ } else if (method == CompositeMethod::IntersectMask) {
+ for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) {
+ auto cmp = surface->compositor->image.buf32 + (y * cstride + surface->compositor->bbox.min.x);
+ if (y == region.min.y) {
+ for (uint32_t y2 = y; y2 < region.max.y; ++y2) {
+ auto tmp = cmp;
+ auto x = surface->compositor->bbox.min.x;
+ while (x < surface->compositor->bbox.max.x) {
+ if (x == region.min.x) {
+ fillMethod()(fill, tmp, y2, x, w, opIntMask, 255);
+ x += w;
+ tmp += w;
+ } else {
+ *tmp = 0;
+ ++tmp;
+ ++x;
+ }
+ }
+ cmp += cstride;
+ }
+ y += (h - 1);
+ } else {
+ rasterRGBA32(cmp, 0x00000000, 0, surface->compositor->bbox.max.x -surface->compositor->bbox.min.x);
+ cmp += cstride;
+ }
+ }
+ } else if (method == CompositeMethod::DifferenceMask) {
+ for (uint32_t y = 0; y < h; ++y) {
+ fillMethod()(fill, cbuffer, region.min.y + y, region.min.x, w, opDifMask, 255);
+ cbuffer += surface->stride;
+ }
+ } else return false;
+
+ //Masking Composition
+ return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox, 255);
+}
+
+template
+static bool _rasterGradientMattedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
+{
auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
auto h = static_cast(region.max.y - region.min.y);
auto w = static_cast(region.max.x - region.min.x);
auto csize = surface->compositor->image.channelSize;
auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize;
+ auto alpha = surface->blender.alpha(surface->compositor->method);
- auto sbuffer = static_cast(alloca(w * sizeof(uint32_t)));
- if (!sbuffer) return false;
+ TVGLOG("SW_ENGINE", "Matted(%d) Gradient [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h);
for (uint32_t y = 0; y < h; ++y) {
- fillFetchLinear(fill, sbuffer, region.min.y + y, region.min.x, w);
- auto dst = buffer;
- auto cmp = cbuffer;
- auto src = sbuffer;
- for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) {
- auto tmp = ALPHA_BLEND(*src, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
+ fillMethod()(fill, buffer, region.min.y + y, region.min.x, w, cbuffer, alpha, csize, 255);
buffer += surface->stride;
cbuffer += surface->stride * csize;
}
@@ -1092,39 +1428,30 @@ static bool _rasterLinearGradientMaskedRect(SwSurface* surface, const SwBBox& re
}
-static bool _rasterTranslucentLinearGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
+template
+static bool _rasterTranslucentGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
{
- if (fill->linear.len < FLT_EPSILON) return false;
-
auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
auto h = static_cast(region.max.y - region.min.y);
auto w = static_cast(region.max.x - region.min.x);
- auto sbuffer = static_cast(alloca(w * sizeof(uint32_t)));
- if (!sbuffer) return false;
-
for (uint32_t y = 0; y < h; ++y) {
- auto dst = buffer;
- fillFetchLinear(fill, sbuffer, region.min.y + y, region.min.x, w);
- for (uint32_t x = 0; x < w; ++x, ++dst) {
- *dst = sbuffer[x] + ALPHA_BLEND(*dst, _ialpha(sbuffer[x]));
- }
+ fillMethod()(fill, buffer, region.min.y + y, region.min.x, w, opBlend, 255);
buffer += surface->stride;
}
return true;
}
-static bool _rasterSolidLinearGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
+template
+static bool _rasterSolidGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
{
- if (fill->linear.len < FLT_EPSILON) return false;
-
auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
auto w = static_cast(region.max.x - region.min.x);
auto h = static_cast(region.max.y - region.min.y);
for (uint32_t y = 0; y < h; ++y) {
- fillFetchLinear(fill, buffer + y * surface->stride, region.min.y + y, region.min.x, w);
+ fillMethod()(fill, buffer + y * surface->stride, region.min.y + y, region.min.x, w);
}
return true;
}
@@ -1132,300 +1459,158 @@ static bool _rasterSolidLinearGradientRect(SwSurface* surface, const SwBBox& reg
static bool _rasterLinearGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
{
+ if (fill->linear.len < FLT_EPSILON) return false;
+
if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- return _rasterLinearGradientMaskedRect(surface, region, fill, alpha);
+ if (_matting(surface)) return _rasterGradientMattedRect(surface, region, fill);
+ else return _rasterGradientMaskedRect(surface, region, fill);
} else {
- if (fill->translucent) return _rasterTranslucentLinearGradientRect(surface, region, fill);
- else _rasterSolidLinearGradientRect(surface, region, fill);
+ if (fill->translucent) return _rasterTranslucentGradientRect(surface, region, fill);
+ else _rasterSolidGradientRect(surface, region, fill);
}
return false;
}
-/************************************************************************/
-/* Rle Linear Gradient */
-/************************************************************************/
-
-static bool _rasterLinearGradientMaskedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill, SwAlpha alpha)
-{
- if (fill->linear.len < FLT_EPSILON) return false;
-
- auto span = rle->spans;
- auto csize = surface->compositor->image.channelSize;
- auto cbuffer = surface->compositor->image.buf8;
- auto buffer = static_cast(alloca(surface->w * sizeof(uint32_t)));
- if (!buffer) return false;
-
- for (uint32_t i = 0; i < rle->size; ++i, ++span) {
- fillFetchLinear(fill, buffer, span->y, span->x, span->len);
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize];
- auto src = buffer;
- if (span->coverage == 255) {
- for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) {
- auto tmp = ALPHA_BLEND(*src, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
- } else {
- auto ialpha = 255 - span->coverage;
- for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) {
- auto tmp = ALPHA_BLEND(*src, alpha(cmp));
- tmp = ALPHA_BLEND(tmp, span->coverage) + ALPHA_BLEND(*dst, ialpha);
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
- }
- }
- return true;
-}
-
-
-static bool _rasterTranslucentLinearGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
-{
- if (fill->linear.len < FLT_EPSILON) return false;
-
- auto span = rle->spans;
- auto buffer = static_cast(alloca(surface->w * sizeof(uint32_t)));
- if (!buffer) return false;
-
- for (uint32_t i = 0; i < rle->size; ++i, ++span) {
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- fillFetchLinear(fill, buffer, span->y, span->x, span->len);
- if (span->coverage == 255) {
- for (uint32_t x = 0; x < span->len; ++x, ++dst) {
- *dst = buffer[x] + ALPHA_BLEND(*dst, _ialpha(buffer[x]));
- }
- } else {
- for (uint32_t x = 0; x < span->len; ++x, ++dst) {
- auto tmp = ALPHA_BLEND(buffer[x], span->coverage);
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
- }
- }
- return true;
-}
-
-
-static bool _rasterSolidLinearGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
-{
- if (fill->linear.len < FLT_EPSILON) return false;
-
- auto buf = static_cast(alloca(surface->w * sizeof(uint32_t)));
- if (!buf) return false;
-
- auto span = rle->spans;
-
- for (uint32_t i = 0; i < rle->size; ++i, ++span) {
- if (span->coverage == 255) {
- fillFetchLinear(fill, surface->buf32 + span->y * surface->stride + span->x, span->y, span->x, span->len);
- } else {
- fillFetchLinear(fill, buf, span->y, span->x, span->len);
- auto dst = &surface->buf32[span->y * surface->stride + span->x];
- for (uint32_t x = 0; x < span->len; ++x) {
- dst[x] = INTERPOLATE(span->coverage, buf[x], dst[x]);
- }
- }
- }
- return true;
-}
-
-
-static bool _rasterLinearGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
+static bool _rasterRadialGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
{
- if (!rle) return false;
+ if (fill->radial.a < FLT_EPSILON) return false;
if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- return _rasterLinearGradientMaskedRle(surface, rle, fill, alpha);
+ if (_matting(surface)) return _rasterGradientMattedRect(surface, region, fill);
+ else return _rasterGradientMaskedRect(surface, region, fill);
} else {
- if (fill->translucent) return _rasterTranslucentLinearGradientRle(surface, rle, fill);
- else return _rasterSolidLinearGradientRle(surface, rle, fill);
+ if (fill->translucent) return _rasterTranslucentGradientRect(surface, region, fill);
+ else _rasterSolidGradientRect(surface, region, fill);
}
return false;
}
+
/************************************************************************/
-/* Rect Radial Gradient */
+/* Rle Gradient */
/************************************************************************/
-static bool _rasterRadialGradientMaskedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill, SwAlpha alpha)
+template
+static bool _rasterGradientMaskedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
{
- if (fill->radial.a < FLT_EPSILON) return false;
-
- auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
- auto h = static_cast(region.max.y - region.min.y);
- auto w = static_cast(region.max.x - region.min.x);
- auto csize = surface->compositor->image.channelSize;
- auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize;
+ TVGLOG("SW_ENGINE", "Masked(%d) Rle Linear Gradient", (int)surface->compositor->method);
- auto sbuffer = static_cast(alloca(w * sizeof(uint32_t)));
- if (!sbuffer) return false;
+ auto span = rle->spans;
+ auto cstride = surface->compositor->image.stride;
+ auto cbuffer = surface->compositor->image.buf32;
+ auto method = surface->compositor->method;
- for (uint32_t y = 0; y < h; ++y) {
- fillFetchRadial(fill, sbuffer, region.min.y + y, region.min.x, w);
- auto dst = buffer;
- auto cmp = cbuffer;
- auto src = sbuffer;
- for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) {
- auto tmp = ALPHA_BLEND(*src, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
+ if (method == CompositeMethod::AddMask) {
+ for (uint32_t i = 0; i < rle->size; ++i, ++span) {
+ auto cmp = &cbuffer[span->y * cstride + span->x];
+ fillMethod()(fill, cmp, span->y, span->x, span->len, opAddMask, span->coverage);
}
- buffer += surface->stride;
- cbuffer += surface->stride * csize;
- }
- return true;
-}
-
-
-static bool _rasterTranslucentRadialGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
-{
- if (fill->radial.a < FLT_EPSILON) return false;
-
- auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
- auto h = static_cast(region.max.y - region.min.y);
- auto w = static_cast(region.max.x - region.min.x);
-
- auto sbuffer = static_cast(alloca(w * sizeof(uint32_t)));
- if (!sbuffer) return false;
-
- for (uint32_t y = 0; y < h; ++y) {
- auto dst = buffer;
- fillFetchRadial(fill, sbuffer, region.min.y + y, region.min.x, w);
- for (uint32_t x = 0; x < w; ++x, ++dst) {
- *dst = sbuffer[x] + ALPHA_BLEND(*dst, _ialpha(sbuffer[x]));
+ } else if (method == CompositeMethod::SubtractMask) {
+ for (uint32_t i = 0; i < rle->size; ++i, ++span) {
+ auto cmp = &cbuffer[span->y * cstride + span->x];
+ fillMethod()(fill, cmp, span->y, span->x, span->len, opSubMask, span->coverage);
}
- buffer += surface->stride;
- }
- return true;
-}
-
-
-static bool _rasterSolidRadialGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
-{
- if (fill->radial.a < FLT_EPSILON) return false;
-
- auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
- auto h = static_cast(region.max.y - region.min.y);
- auto w = static_cast(region.max.x - region.min.x);
-
- for (uint32_t y = 0; y < h; ++y) {
- auto dst = &buffer[y * surface->stride];
- fillFetchRadial(fill, dst, region.min.y + y, region.min.x, w);
- }
- return true;
-}
-
+ } else if (method == CompositeMethod::IntersectMask) {
+ for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) {
+ auto cmp = &cbuffer[y * cstride];
+ uint32_t x = surface->compositor->bbox.min.x;
+ while (x < surface->compositor->bbox.max.x) {
+ if (y == span->y && x == span->x && x + span->len <= surface->compositor->bbox.max.x) {
+ fillMethod()(fill, cmp, span->y, span->x, span->len, opIntMask, span->coverage);
+ x += span->len;
+ ++span;
+ } else {
+ cmp[x] = 0;
+ ++x;
+ }
+ }
+ }
+ } else if (method == CompositeMethod::DifferenceMask) {
+ for (uint32_t i = 0; i < rle->size; ++i, ++span) {
+ auto cmp = &cbuffer[span->y * cstride + span->x];
+ fillMethod()(fill, cmp, span->y, span->x, span->len, opDifMask, span->coverage);
+ }
+ } else return false;
-static bool _rasterRadialGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill)
-{
- if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- return _rasterRadialGradientMaskedRect(surface, region, fill, alpha);
- } else {
- if (fill->translucent) return _rasterTranslucentRadialGradientRect(surface, region, fill);
- else return _rasterSolidRadialGradientRect(surface, region, fill);
- }
- return false;
+ //Masking Composition
+ return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox, 255);
}
-/************************************************************************/
-/* RLE Radial Gradient */
-/************************************************************************/
-
-static bool _rasterRadialGradientMaskedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill, SwAlpha alpha)
+template
+static bool _rasterGradientMattedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
{
- if (fill->radial.a < FLT_EPSILON) return false;
+ TVGLOG("SW_ENGINE", "Matted(%d) Rle Linear Gradient", (int)surface->compositor->method);
auto span = rle->spans;
auto csize = surface->compositor->image.channelSize;
auto cbuffer = surface->compositor->image.buf8;
- auto buffer = static_cast(alloca(surface->w * sizeof(uint32_t)));
- if (!buffer) return false;
+ auto alpha = surface->blender.alpha(surface->compositor->method);
for (uint32_t i = 0; i < rle->size; ++i, ++span) {
- fillFetchRadial(fill, buffer, span->y, span->x, span->len);
auto dst = &surface->buf32[span->y * surface->stride + span->x];
auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize];
- auto src = buffer;
- if (span->coverage == 255) {
- for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) {
- auto tmp = ALPHA_BLEND(*src, alpha(cmp));
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
- } else {
- for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) {
- auto tmp = INTERPOLATE(span->coverage, ALPHA_BLEND(*src, alpha(cmp)), *dst);
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
- }
+ fillMethod()(fill, dst, span->y, span->x, span->len, cmp, alpha, csize, span->coverage);
}
return true;
}
-static bool _rasterTranslucentRadialGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
+template
+static bool _rasterTranslucentGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
{
- if (fill->radial.a < FLT_EPSILON) return false;
-
auto span = rle->spans;
- auto buffer = static_cast(alloca(surface->w * sizeof(uint32_t)));
- if (!buffer) return false;
for (uint32_t i = 0; i < rle->size; ++i, ++span) {
auto dst = &surface->buf32[span->y * surface->stride + span->x];
- fillFetchRadial(fill, buffer, span->y, span->x, span->len);
- if (span->coverage == 255) {
- for (uint32_t x = 0; x < span->len; ++x, ++dst) {
- *dst = buffer[x] + ALPHA_BLEND(*dst, _ialpha(buffer[x]));
- }
- } else {
- for (uint32_t x = 0; x < span->len; ++x, ++dst) {
- auto tmp = ALPHA_BLEND(buffer[x], span->coverage);
- *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
- }
- }
+ if (span->coverage == 255) fillMethod()(fill, dst, span->y, span->x, span->len, opBlend, 255);
+ else fillMethod()(fill, dst, span->y, span->x, span->len, opAlphaBlend, span->coverage);
}
return true;
}
-static bool _rasterSolidRadialGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
+template
+static bool _rasterSolidGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
{
- if (fill->radial.a < FLT_EPSILON) return false;
-
- auto buf = static_cast(alloca(surface->w * sizeof(uint32_t)));
- if (!buf) return false;
-
auto span = rle->spans;
for (uint32_t i = 0; i < rle->size; ++i, ++span) {
auto dst = &surface->buf32[span->y * surface->stride + span->x];
- if (span->coverage == 255) {
- fillFetchRadial(fill, dst, span->y, span->x, span->len);
- } else {
- fillFetchRadial(fill, buf, span->y, span->x, span->len);
- auto ialpha = 255 - span->coverage;
- for (uint32_t x = 0; x < span->len; ++x, ++dst) {
- *dst = ALPHA_BLEND(buf[x], span->coverage) + ALPHA_BLEND(*dst, ialpha);
- }
- }
+ if (span->coverage == 255) fillMethod()(fill, dst, span->y, span->x, span->len);
+ else fillMethod()(fill, dst, span->y, span->x, span->len, opInterpolate, span->coverage);
}
return true;
}
+static bool _rasterLinearGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
+{
+ if (!rle || fill->linear.len < FLT_EPSILON) return false;
+
+ if (_compositing(surface)) {
+ if (_matting(surface)) return _rasterGradientMattedRle(surface, rle, fill);
+ else return _rasterGradientMaskedRle(surface, rle, fill);
+ } else {
+ if (fill->translucent) return _rasterTranslucentGradientRle(surface, rle, fill);
+ else return _rasterSolidGradientRle(surface, rle, fill);
+ }
+ return false;
+}
+
+
static bool _rasterRadialGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill)
{
- if (!rle) return false;
+ if (!rle || fill->radial.a < FLT_EPSILON) return false;
if (_compositing(surface)) {
- auto alpha = surface->blender.alpha(surface->compositor->method);
- return _rasterRadialGradientMaskedRle(surface, rle, fill, alpha);
+ if (_matting(surface)) return _rasterGradientMattedRle(surface, rle, fill);
+ else return _rasterGradientMaskedRle(surface, rle, fill);
} else {
- if (fill->translucent) _rasterTranslucentRadialGradientRle(surface, rle, fill);
- else return _rasterSolidRadialGradientRle(surface, rle, fill);
+ if (fill->translucent) _rasterTranslucentGradientRle(surface, rle, fill);
+ else return _rasterSolidGradientRle(surface, rle, fill);
}
return false;
}
@@ -1449,8 +1634,8 @@ void rasterRGBA32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
bool rasterCompositor(SwSurface* surface)
{
//See CompositeMethod, Alpha:3, InvAlpha:4, Luma:5, InvLuma:6
- surface->blender.alphas[0] = _alpha;
- surface->blender.alphas[1] = _ialpha;
+ surface->blender.alphas[0] = ALPHA;
+ surface->blender.alphas[1] = IALPHA;
if (surface->cs == ColorSpace::ABGR8888 || surface->cs == ColorSpace::ABGR8888S) {
surface->blender.join = _abgrJoin;
@@ -1472,20 +1657,24 @@ bool rasterClear(SwSurface* surface, uint32_t x, uint32_t y, uint32_t w, uint32_
{
if (!surface || !surface->buf32 || surface->stride == 0 || surface->w == 0 || surface->h == 0) return false;
- //full clear
+ //32 bits
if (surface->channelSize == sizeof(uint32_t)) {
+ //full clear
if (w == surface->stride) {
rasterRGBA32(surface->buf32 + (surface->stride * y), 0x00000000, 0, w * h);
+ //partial clear
} else {
auto buffer = surface->buf32 + (surface->stride * y + x);
for (uint32_t i = 0; i < h; i++) {
rasterRGBA32(buffer + (surface->stride * i), 0x00000000, 0, w);
}
}
- //partial clear
+ //8 bits
} else if (surface->channelSize == sizeof(uint8_t)) {
+ //full clear
if (w == surface->stride) {
_rasterGrayscale8(surface->buf8 + (surface->stride * y), 0x00, 0, w * h);
+ //partial clear
} else {
auto buffer = surface->buf8 + (surface->stride * y + x);
for (uint32_t i = 0; i < h; i++) {
@@ -1586,9 +1775,9 @@ bool rasterGradientStroke(SwSurface* surface, SwShape* shape, unsigned id)
bool rasterShape(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
if (a < 255) {
- r = _multiply(r, a);
- g = _multiply(g, a);
- b = _multiply(b, a);
+ r = MULTIPLY(r, a);
+ g = MULTIPLY(g, a);
+ b = MULTIPLY(b, a);
}
if (shape->fastTrack) return _rasterRect(surface, shape->bbox, r, g, b, a);
@@ -1599,9 +1788,9 @@ bool rasterShape(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8
bool rasterStroke(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
if (a < 255) {
- r = _multiply(r, a);
- g = _multiply(g, a);
- b = _multiply(b, a);
+ r = MULTIPLY(r, a);
+ g = MULTIPLY(g, a);
+ b = MULTIPLY(b, a);
}
return _rasterRle(surface, shape->strokeRle, r, g, b, a);
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterAvx.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterAvx.h
index 59a83ab06..a73768055 100644
--- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterAvx.h
+++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterAvx.h
@@ -148,7 +148,7 @@ static bool avxRasterTranslucentRle(SwSurface* surface, const SwRleData* rle, ui
if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage);
else src = color;
- auto ialpha = _ialpha(src);
+ auto ialpha = IALPHA(src);
//1. fill the not aligned memory (for 128-bit registers a 16-bytes alignment is required)
auto notAligned = ((uintptr_t)dst & 0xf) / 4;
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterC.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterC.h
index 18a096648..a040269ff 100644
--- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterC.h
+++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterC.h
@@ -40,8 +40,9 @@ static bool inline cRasterTranslucentRle(SwSurface* surface, const SwRleData* rl
auto dst = &surface->buf32[span->y * surface->stride + span->x];
if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage);
else src = color;
+ auto ialpha = IALPHA(src);
for (uint32_t x = 0; x < span->len; ++x, ++dst) {
- *dst = src + ALPHA_BLEND(*dst, _ialpha(src));
+ *dst = src + ALPHA_BLEND(*dst, ialpha);
}
}
//8bit grayscale
@@ -49,10 +50,11 @@ static bool inline cRasterTranslucentRle(SwSurface* surface, const SwRleData* rl
uint8_t src;
for (uint32_t i = 0; i < rle->size; ++i, ++span) {
auto dst = &surface->buf8[span->y * surface->stride + span->x];
- if (span->coverage < 255) src = _multiply(span->coverage, a);
+ if (span->coverage < 255) src = MULTIPLY(span->coverage, a);
else src = a;
+ auto ialpha = ~a;
for (uint32_t x = 0; x < span->len; ++x, ++dst) {
- *dst = src + _multiply(*dst, ~src);
+ *dst = src + MULTIPLY(*dst, ialpha);
}
}
}
@@ -69,7 +71,7 @@ static bool inline cRasterTranslucentRect(SwSurface* surface, const SwBBox& regi
if (surface->channelSize == sizeof(uint32_t)) {
auto color = surface->blender.join(r, g, b, a);
auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
- auto ialpha = _ialpha(color);
+ auto ialpha = IALPHA(color);
for (uint32_t y = 0; y < h; ++y) {
auto dst = &buffer[y * surface->stride];
for (uint32_t x = 0; x < w; ++x, ++dst) {
@@ -79,10 +81,11 @@ static bool inline cRasterTranslucentRect(SwSurface* surface, const SwBBox& regi
//8bit grayscale
} else if (surface->channelSize == sizeof(uint8_t)) {
auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x;
+ auto ialpha = ~a;
for (uint32_t y = 0; y < h; ++y) {
auto dst = &buffer[y * surface->stride];
for (uint32_t x = 0; x < w; ++x, ++dst) {
- *dst = a + _multiply(*dst, ~a);
+ *dst = a + MULTIPLY(*dst, ialpha);
}
}
}
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterMaskedTexmapInternal.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterMaskedTexmapInternal.h
new file mode 100644
index 000000000..0183b63cc
--- /dev/null
+++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterMaskedTexmapInternal.h
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2023 the ThorVG project. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef TEXMAP_INT_MASK
+{
+ float _dudx = dudx, _dvdx = dvdx;
+ float _dxdya = dxdya, _dxdyb = dxdyb, _dudya = dudya, _dvdya = dvdya;
+ float _xa = xa, _xb = xb, _ua = ua, _va = va;
+ auto sbuf = image->buf32;
+ int32_t sw = static_cast(image->stride);
+ int32_t sh = image->h;
+ int32_t x1, x2, ar, ab, iru, irv, px, ay;
+ int32_t vv = 0, uu = 0;
+ int32_t minx = INT32_MAX, maxx = INT32_MIN;
+ float dx, u, v, iptr;
+ auto cbuffer = surface->compositor->image.buf32;
+ SwSpan* span = nullptr; //used only when rle based.
+
+ if (!_arrange(image, region, yStart, yEnd)) return;
+
+ //Clear out of the Polygon vertical ranges
+ auto size = surface->compositor->bbox.max.x - surface->compositor->bbox.min.x;
+ if (dirFlag == 1) { //left top case.
+ for(int y = surface->compositor->bbox.min.y; y < yStart; ++y) {
+ rasterRGBA32(surface->compositor->image.buf32 + y * surface->compositor->image.stride, 0, surface->compositor->bbox.min.x, size);
+ }
+ }
+ if (dirFlag == 4) { //right bottom case.
+ for(int y = yEnd; y < surface->compositor->bbox.max.y; ++y) {
+ rasterRGBA32(surface->compositor->image.buf32 + y * surface->compositor->image.stride, 0, surface->compositor->bbox.min.x, size);
+ }
+ }
+
+ //Loop through all lines in the segment
+ uint32_t spanIdx = 0;
+
+ if (region) {
+ minx = region->min.x;
+ maxx = region->max.x;
+ } else {
+ span = image->rle->spans;
+ while (span->y < yStart) {
+ ++span;
+ ++spanIdx;
+ }
+ }
+
+ for (int32_t y = yStart; y < yEnd; ++y) {
+ auto cmp = &cbuffer[y * surface->compositor->image.stride];
+ x1 = (int32_t)_xa;
+ x2 = (int32_t)_xb;
+
+ if (!region) {
+ minx = INT32_MAX;
+ maxx = INT32_MIN;
+ //one single row, could be consisted of multiple spans.
+ while (span->y == y && spanIdx < image->rle->size) {
+ if (minx > span->x) minx = span->x;
+ if (maxx < span->x + span->len) maxx = span->x + span->len;
+ ++span;
+ ++spanIdx;
+ }
+ }
+
+ if (x1 < minx) x1 = minx;
+ if (x2 > maxx) x2 = maxx;
+
+ //Anti-Aliasing frames
+ //FIXME: this aa must be applied before masking op
+ ay = y - aaSpans->yStart;
+ if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1;
+ if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2;
+
+ //Range allowed
+ if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) {
+ for (int32_t x = surface->compositor->bbox.min.x; x < surface->compositor->bbox.max.x; ++x) {
+ //Range allowed
+ if (x >= x1 && x < x2) {
+ //Perform subtexel pre-stepping on UV
+ dx = 1 - (_xa - x1);
+ u = _ua + dx * _dudx;
+ v = _va + dx * _dvdx;
+ if ((uint32_t)v >= image->h) {
+ cmp[x] = 0;
+ } else {
+ if (opacity == 255) {
+ uu = (int) u;
+ vv = (int) v;
+ ar = (int)(255 * (1 - modff(u, &iptr)));
+ ab = (int)(255 * (1 - modff(v, &iptr)));
+ iru = uu + 1;
+ irv = vv + 1;
+
+ if (vv >= sh) continue;
+
+ px = *(sbuf + (vv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* right pixel */
+ int px2 = *(sbuf + (vv * sw) + iru);
+ px = INTERPOLATE(px, px2, ar);
+ }
+ /* vertical interpolate */
+ if (irv < sh) {
+ /* bottom pixel */
+ int px2 = *(sbuf + (irv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* bottom right pixel */
+ int px3 = *(sbuf + (irv * sw) + iru);
+ px2 = INTERPOLATE(px2, px3, ar);
+ }
+ px = INTERPOLATE(px, px2, ab);
+ }
+ cmp[x] = ALPHA_BLEND(cmp[x], ALPHA(px));
+
+ //Step UV horizontally
+ u += _dudx;
+ v += _dvdx;
+ } else {
+ uu = (int) u;
+ vv = (int) v;
+ ar = (int)(255 * (1 - modff(u, &iptr)));
+ ab = (int)(255 * (1 - modff(v, &iptr)));
+ iru = uu + 1;
+ irv = vv + 1;
+
+ if (vv >= sh) continue;
+
+ px = *(sbuf + (vv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* right pixel */
+ int px2 = *(sbuf + (vv * sw) + iru);
+ px = INTERPOLATE(px, px2, ar);
+ }
+ /* vertical interpolate */
+ if (irv < sh) {
+ /* bottom pixel */
+ int px2 = *(sbuf + (irv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* bottom right pixel */
+ int px3 = *(sbuf + (irv * sw) + iru);
+ px2 = INTERPOLATE(px2, px3, ar);
+ }
+ px = INTERPOLATE(px, px2, ab);
+ }
+ cmp[x] = ALPHA_BLEND(cmp[x], MULTIPLY(ALPHA(px), opacity));
+
+ //Step UV horizontally
+ u += _dudx;
+ v += _dvdx;
+ }
+ }
+ } else {
+ //Clear out of polygon horizontal range
+ if (x < x1 && (dirFlag == 1 || dirFlag == 2)) cmp[x] = 0;
+ else if (x >= x2 && (dirFlag == 3 || dirFlag == 4)) cmp[x] = 0;
+ }
+ }
+ }
+ //Step along both edges
+ _xa += _dxdya;
+ _xb += _dxdyb;
+ _ua += _dudya;
+ _va += _dvdya;
+ }
+ xa = _xa;
+ xb = _xb;
+ ua = _ua;
+ va = _va;
+}
+#else
+{
+ float _dudx = dudx, _dvdx = dvdx;
+ float _dxdya = dxdya, _dxdyb = dxdyb, _dudya = dudya, _dvdya = dvdya;
+ float _xa = xa, _xb = xb, _ua = ua, _va = va;
+ auto sbuf = image->buf32;
+ int32_t sw = static_cast(image->stride);
+ int32_t sh = image->h;
+ int32_t x1, x2, x, y, ar, ab, iru, irv, px, ay;
+ int32_t vv = 0, uu = 0;
+ int32_t minx = INT32_MAX, maxx = INT32_MIN;
+ float dx, u, v, iptr;
+ SwSpan* span = nullptr; //used only when rle based.
+
+ if (!_arrange(image, region, yStart, yEnd)) return;
+
+ //Loop through all lines in the segment
+ uint32_t spanIdx = 0;
+
+ if (region) {
+ minx = region->min.x;
+ maxx = region->max.x;
+ } else {
+ span = image->rle->spans;
+ while (span->y < yStart) {
+ ++span;
+ ++spanIdx;
+ }
+ }
+
+ y = yStart;
+
+ while (y < yEnd) {
+ x1 = (int32_t)_xa;
+ x2 = (int32_t)_xb;
+
+ if (!region) {
+ minx = INT32_MAX;
+ maxx = INT32_MIN;
+ //one single row, could be consisted of multiple spans.
+ while (span->y == y && spanIdx < image->rle->size) {
+ if (minx > span->x) minx = span->x;
+ if (maxx < span->x + span->len) maxx = span->x + span->len;
+ ++span;
+ ++spanIdx;
+ }
+ }
+ if (x1 < minx) x1 = minx;
+ if (x2 > maxx) x2 = maxx;
+
+ //Anti-Aliasing frames
+ ay = y - aaSpans->yStart;
+ if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1;
+ if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2;
+
+ //Range allowed
+ if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) {
+
+ //Perform subtexel pre-stepping on UV
+ dx = 1 - (_xa - x1);
+ u = _ua + dx * _dudx;
+ v = _va + dx * _dvdx;
+
+ x = x1;
+
+ auto cmp = &surface->compositor->image.buf32[y * surface->compositor->image.stride + x1];
+
+ if (opacity == 255) {
+ //Draw horizontal line
+ while (x++ < x2) {
+ uu = (int) u;
+ vv = (int) v;
+
+ ar = (int)(255 * (1 - modff(u, &iptr)));
+ ab = (int)(255 * (1 - modff(v, &iptr)));
+ iru = uu + 1;
+ irv = vv + 1;
+
+ if (vv >= sh) continue;
+
+ px = *(sbuf + (vv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* right pixel */
+ int px2 = *(sbuf + (vv * sw) + iru);
+ px = INTERPOLATE(px, px2, ar);
+ }
+ /* vertical interpolate */
+ if (irv < sh) {
+ /* bottom pixel */
+ int px2 = *(sbuf + (irv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* bottom right pixel */
+ int px3 = *(sbuf + (irv * sw) + iru);
+ px2 = INTERPOLATE(px2, px3, ar);
+ }
+ px = INTERPOLATE(px, px2, ab);
+ }
+#ifdef TEXMAP_ADD_MASK
+ *cmp = px + ALPHA_BLEND(*cmp, IALPHA(px));
+#elif defined(TEXMAP_SUB_MASK)
+ *cmp = ALPHA_BLEND(*cmp, IALPHA(px));
+#elif defined(TEXMAP_DIF_MASK)
+ *cmp = ALPHA_BLEND(px, IALPHA(*cmp)) + ALPHA_BLEND(*cmp, IALPHA(px));
+#endif
+ ++cmp;
+
+ //Step UV horizontally
+ u += _dudx;
+ v += _dvdx;
+ //range over?
+ if ((uint32_t)v >= image->h) break;
+ }
+ } else {
+ //Draw horizontal line
+ while (x++ < x2) {
+ uu = (int) u;
+ vv = (int) v;
+
+ ar = (int)(255 * (1 - modff(u, &iptr)));
+ ab = (int)(255 * (1 - modff(v, &iptr)));
+ iru = uu + 1;
+ irv = vv + 1;
+
+ if (vv >= sh) continue;
+
+ px = *(sbuf + (vv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* right pixel */
+ int px2 = *(sbuf + (vv * sw) + iru);
+ px = INTERPOLATE(px, px2, ar);
+ }
+ /* vertical interpolate */
+ if (irv < sh) {
+ /* bottom pixel */
+ int px2 = *(sbuf + (irv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* bottom right pixel */
+ int px3 = *(sbuf + (irv * sw) + iru);
+ px2 = INTERPOLATE(px2, px3, ar);
+ }
+ px = INTERPOLATE(px, px2, ab);
+ }
+#ifdef TEXMAP_ADD_MASK
+ *cmp = INTERPOLATE(px, *cmp, opacity);
+#elif defined(TEXMAP_SUB_MASK)
+ *cmp = ALPHA_BLEND(*cmp, IALPHA(ALPHA_BLEND(px, opacity)));
+#elif defined(TEXMAP_DIF_MASK)
+ auto src = ALPHA_BLEND(px, opacity);
+ *cmp = ALPHA_BLEND(src, IALPHA(*cmp)) + ALPHA_BLEND(*cmp, IALPHA(src));
+#endif
+ ++cmp;
+
+ //Step UV horizontally
+ u += _dudx;
+ v += _dvdx;
+ //range over?
+ if ((uint32_t)v >= image->h) break;
+ }
+ }
+ }
+
+ //Step along both edges
+ _xa += _dxdya;
+ _xb += _dxdyb;
+ _ua += _dudya;
+ _va += _dvdya;
+
+ if (!region && spanIdx >= image->rle->size) break;
+
+ ++y;
+ }
+ xa = _xa;
+ xb = _xb;
+ ua = _ua;
+ va = _va;
+}
+#endif
\ No newline at end of file
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterNeon.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterNeon.h
index 5bed2f85a..0b581fbd6 100644
--- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterNeon.h
+++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterNeon.h
@@ -67,7 +67,7 @@ static bool neonRasterTranslucentRle(SwSurface* surface, const SwRleData* rle, u
else src = color;
auto dst = &surface->buf32[span->y * surface->stride + span->x];
- auto ialpha = 255 - _alpha(src);
+ auto ialpha = IALPHA(src);
if ((((size_t) dst) & 0x7) != 0) {
//fill not aligned byte
@@ -105,7 +105,7 @@ static bool neonRasterTranslucentRect(SwSurface* surface, const SwBBox& region,
auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x;
auto h = static_cast(region.max.y - region.min.y);
auto w = static_cast(region.max.x - region.min.x);
- auto ialpha = 255 - _alpha(color);
+ auto ialpha = IALPHA(color);
auto vColor = vdup_n_u32(color);
auto vIalpha = vdup_n_u8((uint8_t) ialpha);
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmap.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmap.h
index 2a04ba742..7541b1d6f 100644
--- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmap.h
+++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmap.h
@@ -69,40 +69,46 @@ static bool _arrange(const SwImage* image, const SwBBox* region, int& yStart, in
}
-static void _rasterPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, uint32_t opacity, SwAlpha alpha, AASpans* aaSpans)
+static void _rasterMaskedPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, uint32_t opacity, AASpans* aaSpans, uint8_t dirFlag = 0)
{
-#define TEXMAP_TRANSLUCENT
-#define TEXMAP_MASKING
- #include "tvgSwRasterTexmapInternal.h"
-#undef TEXMAP_MASKING
-#undef TEXMAP_TRANSLUCENT
+ auto method = surface->compositor->method;
+
+ if (method == CompositeMethod::AddMask) {
+ #define TEXMAP_ADD_MASK
+ #include "tvgSwRasterMaskedTexmapInternal.h"
+ #undef TEXMAP_ADD_MASK
+ } else if (method == CompositeMethod::SubtractMask) {
+ #define TEXMAP_SUB_MASK
+ #include "tvgSwRasterMaskedTexmapInternal.h"
+ #undef TEXMAP_SUB_MASK
+ } else if (method == CompositeMethod::IntersectMask) {
+ #define TEXMAP_INT_MASK
+ #include "tvgSwRasterMaskedTexmapInternal.h"
+ #undef TEXMAP_INT_MASK
+ } else if (method == CompositeMethod::DifferenceMask) {
+ #define TEXMAP_DIF_MASK
+ #include "tvgSwRasterMaskedTexmapInternal.h"
+ #undef TEXMAP_DIF_MASK
+ }
}
-static void _rasterPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, SwAlpha alpha, AASpans* aaSpans)
+static void _rasterMattedPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, uint32_t opacity, AASpans* aaSpans)
{
-#define TEXMAP_MASKING
- #include "tvgSwRasterTexmapInternal.h"
-#undef TEXMAP_MASKING
+#define TEXMAP_MATTING
+ #include "tvgSwRasterTexmapInternal.h"
+#undef TEXMAP_MATTING
}
static void _rasterPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, uint32_t opacity, AASpans* aaSpans)
-{
-#define TEXMAP_TRANSLUCENT
- #include "tvgSwRasterTexmapInternal.h"
-#undef TEXMAP_TRANSLUCENT
-}
-
-
-static void _rasterPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, AASpans* aaSpans)
{
#include "tvgSwRasterTexmapInternal.h"
}
/* This mapping algorithm is based on Mikael Kalms's. */
-static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const SwBBox* region, uint32_t opacity, Polygon& polygon, SwAlpha alpha, AASpans* aaSpans)
+static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const SwBBox* region, uint32_t opacity, Polygon& polygon, AASpans* aaSpans)
{
float x[3] = {polygon.vertex[0].pt.x, polygon.vertex[1].pt.x, polygon.vertex[2].pt.x};
float y[3] = {polygon.vertex[0].pt.y, polygon.vertex[1].pt.y, polygon.vertex[2].pt.y};
@@ -165,6 +171,7 @@ static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const
if (mathEqual(y[1], y[2])) side = x[2] > x[1];
auto regionTop = region ? region->min.y : image->rle->spans->y; //Normal Image or Rle Image?
+ auto compositing = _compositing(surface); //Composition required
//Longer edge is on the left side
if (!side) {
@@ -190,13 +197,10 @@ static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const
dxdyb = dxdy[0];
xb = x[0] + dy * dxdyb + (off_y * dxdyb);
- if (alpha) {
- if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], alpha, aaSpans);
- else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, alpha, aaSpans);
- } else {
- if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans);
- else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans);
- }
+ if (compositing) {
+ if (_matting(surface)) _rasterMattedPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans);
+ else _rasterMaskedPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans, 1);
+ } else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans);
upper = true;
}
@@ -211,13 +215,10 @@ static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const
// Set right edge X-slope and perform subpixel pre-stepping
dxdyb = dxdy[2];
xb = x[1] + (1 - (y[1] - yi[1])) * dxdyb + (off_y * dxdyb);
- if (alpha) {
- if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], alpha, aaSpans);
- else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, alpha, aaSpans);
- } else {
- if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans);
- else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans);
- }
+ if (compositing) {
+ if (_matting(surface)) _rasterMattedPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans);
+ else _rasterMaskedPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans, 2);
+ } else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans);
}
//Longer edge is on the right side
} else {
@@ -240,13 +241,10 @@ static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const
ua = u[0] + dy * dudya + (off_y * dudya);
va = v[0] + dy * dvdya + (off_y * dvdya);
- if (alpha) {
- if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], alpha, aaSpans);
- else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, alpha, aaSpans);
- } else {
- if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans);
- else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans);
- }
+ if (compositing) {
+ if (_matting(surface)) _rasterMattedPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans);
+ else _rasterMaskedPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans, 3);
+ } else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans);
upper = true;
}
@@ -264,13 +262,10 @@ static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const
ua = u[1] + dy * dudya + (off_y * dudya);
va = v[1] + dy * dvdya + (off_y * dvdya);
- if (alpha) {
- if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], alpha, aaSpans);
- else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, alpha, aaSpans);
- } else {
- if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans);
- else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans);
- }
+ if (compositing) {
+ if (_matting(surface)) _rasterMattedPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans);
+ else _rasterMaskedPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans, 4);
+ } else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans);
}
}
}
@@ -508,7 +503,7 @@ static bool _apply(SwSurface* surface, AASpans* aaSpans)
pos = 1;
while (pos <= line->length[0]) {
- *dst = INTERPOLATE((line->coverage[0] * pos), *dst, pixel);
+ *dst = INTERPOLATE(*dst, pixel, line->coverage[0] * pos);
++dst;
++pos;
}
@@ -520,7 +515,7 @@ static bool _apply(SwSurface* surface, AASpans* aaSpans)
pos = width;
while ((int32_t)(width - line->length[1]) < pos) {
- *dst = INTERPOLATE(255 - (line->coverage[1] * (line->length[1] - (width - pos))), *dst, pixel);
+ *dst = INTERPOLATE(*dst, pixel, 255 - (line->coverage[1] * (line->length[1] - (width - pos))));
--dst;
--pos;
}
@@ -545,7 +540,7 @@ static bool _apply(SwSurface* surface, AASpans* aaSpans)
| / |
3 -- 2
*/
-static bool _rasterTexmapPolygon(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox* region, uint32_t opacity, SwAlpha alpha)
+static bool _rasterTexmapPolygon(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox* region, uint32_t opacity)
{
//Exceptions: No dedicated drawing area?
if ((!image->rle && !region) || (image->rle && image->rle->size == 0)) return false;
@@ -576,14 +571,14 @@ static bool _rasterTexmapPolygon(SwSurface* surface, const SwImage* image, const
polygon.vertex[1] = vertices[1];
polygon.vertex[2] = vertices[3];
- _rasterPolygonImage(surface, image, region, opacity, polygon, alpha, aaSpans);
+ _rasterPolygonImage(surface, image, region, opacity, polygon, aaSpans);
//Draw the second polygon
polygon.vertex[0] = vertices[1];
polygon.vertex[1] = vertices[2];
polygon.vertex[2] = vertices[3];
- _rasterPolygonImage(surface, image, region, opacity, polygon, alpha, aaSpans);
+ _rasterPolygonImage(surface, image, region, opacity, polygon, aaSpans);
return _apply(surface, aaSpans);
}
@@ -602,7 +597,7 @@ static bool _rasterTexmapPolygon(SwSurface* surface, const SwImage* image, const
Should provide two Polygons, one for each triangle.
// TODO: region?
*/
-static bool _rasterTexmapPolygonMesh(SwSurface* surface, const SwImage* image, const RenderMesh* mesh, const Matrix* transform, const SwBBox* region, uint32_t opacity, SwAlpha alpha)
+static bool _rasterTexmapPolygonMesh(SwSurface* surface, const SwImage* image, const RenderMesh* mesh, const Matrix* transform, const SwBBox* region, uint32_t opacity)
{
//Exceptions: No dedicated drawing area?
if ((!image->rle && !region) || (image->rle && image->rle->size == 0)) return false;
@@ -636,7 +631,7 @@ static bool _rasterTexmapPolygonMesh(SwSurface* surface, const SwImage* image, c
auto aaSpans = _AASpans(ys, ye, image, region);
if (aaSpans) {
for (uint32_t i = 0; i < mesh->triangleCnt; i++) {
- _rasterPolygonImage(surface, image, region, opacity, transformedTris[i], alpha, aaSpans);
+ _rasterPolygonImage(surface, image, region, opacity, transformedTris[i], aaSpans);
}
// Apply to surface (note: frees the AA spans)
_apply(surface, aaSpans);
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h
index 92e8c1b35..bfa7db218 100644
--- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h
+++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h
@@ -36,9 +36,9 @@
uint32_t* buf;
SwSpan* span = nullptr; //used only when rle based.
-#ifdef TEXMAP_MASKING
- uint8_t* cmp;
+#ifdef TEXMAP_MATTING
auto csize = surface->compositor->image.channelSize;
+ auto alpha = surface->blender.alpha(surface->compositor->method);
#endif
if (!_arrange(image, region, yStart, yEnd)) return;
@@ -82,75 +82,121 @@
if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1;
if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2;
- //Range exception
- if ((x2 - x1) < 1 || (x1 >= maxx) || (x2 <= minx)) goto next;
+ //Range allowed
+ if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) {
- //Perform subtexel pre-stepping on UV
- dx = 1 - (_xa - x1);
- u = _ua + dx * _dudx;
- v = _va + dx * _dvdx;
+ //Perform subtexel pre-stepping on UV
+ dx = 1 - (_xa - x1);
+ u = _ua + dx * _dudx;
+ v = _va + dx * _dvdx;
- buf = dbuf + ((y * dw) + x1);
+ buf = dbuf + ((y * dw) + x1);
- x = x1;
+ x = x1;
-#ifdef TEXMAP_MASKING
- cmp = &surface->compositor->image.buf8[(y * surface->compositor->image.stride + x1) * csize];
+#ifdef TEXMAP_MATTING
+ auto cmp = &surface->compositor->image.buf8[(y * surface->compositor->image.stride + x1) * csize];
#endif
- //Draw horizontal line
- while (x++ < x2) {
- uu = (int) u;
- vv = (int) v;
-
- ar = (int)(255 * (1 - modff(u, &iptr)));
- ab = (int)(255 * (1 - modff(v, &iptr)));
- iru = uu + 1;
- irv = vv + 1;
-
- if (vv >= sh) continue;
-
- px = *(sbuf + (vv * sw) + uu);
-
- /* horizontal interpolate */
- if (iru < sw) {
- /* right pixel */
- int px2 = *(sbuf + (vv * sw) + iru);
- px = INTERPOLATE(ar, px, px2);
- }
- /* vertical interpolate */
- if (irv < sh) {
- /* bottom pixel */
- int px2 = *(sbuf + (irv * sw) + uu);
-
- /* horizontal interpolate */
- if (iru < sw) {
- /* bottom right pixel */
- int px3 = *(sbuf + (irv * sw) + iru);
- px2 = INTERPOLATE(ar, px2, px3);
- }
- px = INTERPOLATE(ab, px, px2);
- }
-#if defined(TEXMAP_MASKING) && defined(TEXMAP_TRANSLUCENT)
- auto src = ALPHA_BLEND(px, _multiply(opacity, alpha(cmp)));
-#elif defined(TEXMAP_MASKING)
- auto src = ALPHA_BLEND(px, alpha(cmp));
-#elif defined(TEXMAP_TRANSLUCENT)
- auto src = ALPHA_BLEND(px, opacity);
+ if (opacity == 255) {
+ //Draw horizontal line
+ while (x++ < x2) {
+ uu = (int) u;
+ vv = (int) v;
+
+ ar = (int)(255 * (1 - modff(u, &iptr)));
+ ab = (int)(255 * (1 - modff(v, &iptr)));
+ iru = uu + 1;
+ irv = vv + 1;
+
+ if (vv >= sh) continue;
+
+ px = *(sbuf + (vv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* right pixel */
+ int px2 = *(sbuf + (vv * sw) + iru);
+ px = INTERPOLATE(px, px2, ar);
+ }
+ /* vertical interpolate */
+ if (irv < sh) {
+ /* bottom pixel */
+ int px2 = *(sbuf + (irv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* bottom right pixel */
+ int px3 = *(sbuf + (irv * sw) + iru);
+ px2 = INTERPOLATE(px2, px3, ar);
+ }
+ px = INTERPOLATE(px, px2, ab);
+ }
+#ifdef TEXMAP_MATTING
+ auto src = ALPHA_BLEND(px, alpha(cmp));
+ cmp += csize;
#else
- auto src = px;
+ auto src = px;
#endif
- *buf = src + ALPHA_BLEND(*buf, _ialpha(src));
- ++buf;
-#ifdef TEXMAP_MASKING
- cmp += csize;
+ *buf = src + ALPHA_BLEND(*buf, IALPHA(src));
+ ++buf;
+
+ //Step UV horizontally
+ u += _dudx;
+ v += _dvdx;
+ //range over?
+ if ((uint32_t)v >= image->h) break;
+ }
+ } else {
+ //Draw horizontal line
+ while (x++ < x2) {
+ uu = (int) u;
+ vv = (int) v;
+
+ ar = (int)(255 * (1 - modff(u, &iptr)));
+ ab = (int)(255 * (1 - modff(v, &iptr)));
+ iru = uu + 1;
+ irv = vv + 1;
+
+ if (vv >= sh) continue;
+
+ px = *(sbuf + (vv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* right pixel */
+ int px2 = *(sbuf + (vv * sw) + iru);
+ px = INTERPOLATE(px, px2, ar);
+ }
+ /* vertical interpolate */
+ if (irv < sh) {
+ /* bottom pixel */
+ int px2 = *(sbuf + (irv * sw) + uu);
+
+ /* horizontal interpolate */
+ if (iru < sw) {
+ /* bottom right pixel */
+ int px3 = *(sbuf + (irv * sw) + iru);
+ px2 = INTERPOLATE(px2, px3, ar);
+ }
+ px = INTERPOLATE(px, px2, ab);
+ }
+#ifdef TEXMAP_MATTING
+ auto src = ALPHA_BLEND(px, MULTIPLY(opacity, alpha(cmp)));
+ cmp += csize;
+#else
+ auto src = ALPHA_BLEND(px, opacity);
#endif
- //Step UV horizontally
- u += _dudx;
- v += _dvdx;
- //range over?
- if ((uint32_t)v >= image->h) break;
+ *buf = src + ALPHA_BLEND(*buf, IALPHA(src));
+ ++buf;
+
+ //Step UV horizontally
+ u += _dudx;
+ v += _dvdx;
+ //range over?
+ if ((uint32_t)v >= image->h) break;
+ }
+ }
}
-next:
//Step along both edges
_xa += _dxdya;
_xb += _dxdyb;
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/tvgPaint.cpp b/libfenrir/src/main/jni/thorvg/src/lib/tvgPaint.cpp
index 5746fe49a..450a221d6 100644
--- a/libfenrir/src/main/jni/thorvg/src/lib/tvgPaint.cpp
+++ b/libfenrir/src/main/jni/thorvg/src/lib/tvgPaint.cpp
@@ -166,6 +166,7 @@ bool Paint::Impl::render(RenderMethod& renderer)
Create a composition image. */
if (compData && compData->method != CompositeMethod::ClipPath && !(compData->target->pImpl->ctxFlag & ContextFlag::FastTrack)) {
auto region = smethod->bounds(renderer);
+ if (MASK_OPERATION(compData->method)) region.add(compData->target->pImpl->smethod->bounds(renderer));
if (region.w == 0 || region.h == 0) return true;
cmp = renderer.target(region, COMPOSITE_TO_COLORSPACE(renderer, compData->method));
if (renderer.beginComposite(cmp, CompositeMethod::None, 255)) {
@@ -209,11 +210,18 @@ RenderData Paint::Impl::update(RenderMethod& renderer, const RenderTransform* pT
auto tryFastTrack = false;
if (target->identifier() == TVG_CLASS_ID_SHAPE) {
if (method == CompositeMethod::ClipPath) tryFastTrack = true;
+ //OPTIMIZE HERE: Actually, this condition AlphaMask is useless. We can skip it?
else if (method == CompositeMethod::AlphaMask) {
auto shape = static_cast(target);
uint8_t a;
shape->fillColor(nullptr, nullptr, nullptr, &a);
if (a == 255 && shape->opacity() == 255 && !shape->fill()) tryFastTrack = true;
+ //OPTIMIZE HERE: Actually, this condition InvAlphaMask is useless. We can skip it?
+ } else if (method == CompositeMethod::InvAlphaMask) {
+ auto shape = static_cast(target);
+ uint8_t a;
+ shape->fillColor(nullptr, nullptr, nullptr, &a);
+ if ((a == 0 || shape->opacity() == 0) && !shape->fill()) tryFastTrack = true;
}
if (tryFastTrack) {
RenderRegion viewport2;
diff --git a/libfenrir/src/main/jni/thorvg/src/lib/tvgRender.h b/libfenrir/src/main/jni/thorvg/src/lib/tvgRender.h
index e4445c579..62be27df9 100644
--- a/libfenrir/src/main/jni/thorvg/src/lib/tvgRender.h
+++ b/libfenrir/src/main/jni/thorvg/src/lib/tvgRender.h
@@ -98,6 +98,20 @@ struct RenderRegion
if (w < 0) w = 0;
if (h < 0) h = 0;
}
+
+ void add(const RenderRegion& rhs)
+ {
+ if (rhs.x < x) {
+ w += (x - rhs.x);
+ x = rhs.x;
+ }
+ if (rhs.y < y) {
+ h += (y - rhs.y);
+ y = rhs.y;
+ }
+ if (rhs.x + rhs.w > x + w) w = (rhs.x + rhs.w) - x;
+ if (rhs.y + rhs.h > y + h) h = (rhs.y + rhs.h) - y;
+ }
};
struct RenderTransform
@@ -238,6 +252,25 @@ class RenderMethod
virtual bool endComposite(Compositor* cmp) = 0;
};
+static inline bool MASK_OPERATION(CompositeMethod method)
+{
+ switch(method) {
+ case CompositeMethod::AlphaMask:
+ case CompositeMethod::InvAlphaMask:
+ case CompositeMethod::LumaMask:
+ case CompositeMethod::InvLumaMask:
+ return false;
+ case CompositeMethod::AddMask:
+ case CompositeMethod::SubtractMask:
+ case CompositeMethod::IntersectMask:
+ case CompositeMethod::DifferenceMask:
+ return true;
+ default:
+ TVGERR("COMMON", "Unsupported Composite Size! = %d", (int)method);
+ return false;
+ }
+}
+
static inline uint8_t CHANNEL_SIZE(ColorSpace cs)
{
switch(cs) {
@@ -263,6 +296,10 @@ static inline ColorSpace COMPOSITE_TO_COLORSPACE(RenderMethod& renderer, Composi
return ColorSpace::Grayscale8;
case CompositeMethod::LumaMask:
case CompositeMethod::InvLumaMask:
+ case CompositeMethod::AddMask:
+ case CompositeMethod::SubtractMask:
+ case CompositeMethod::IntersectMask:
+ case CompositeMethod::DifferenceMask:
return renderer.colorSpace();
default:
TVGERR("COMMON", "Unsupported Composite Size! = %d", (int)method);
diff --git a/other_tool/getter-setter-fix-plugin-gradle/build.gradle b/other_tool/getter-setter-fix-plugin-gradle/build.gradle
index 14a7553ed..4d31a3174 100644
--- a/other_tool/getter-setter-fix-plugin-gradle/build.gradle
+++ b/other_tool/getter-setter-fix-plugin-gradle/build.gradle
@@ -1,6 +1,6 @@
plugins {
id("java")
- id("kotlin")
+ id("org.jetbrains.kotlin.jvm") version "1.8.21"
id("org.jetbrains.intellij") version "1.9.0"
}
@@ -14,7 +14,7 @@ repositories {
}
intellij {
- version = "2022.2.1"
+ version = "2022.3.1"
type = "IC"
plugins = ['android']
}
diff --git a/other_tool/getter-setter-fix-plugin-gradle/getter-setter-fix-plugin-gradle-1.0.zip b/other_tool/getter-setter-fix-plugin-gradle/getter-setter-fix-plugin-gradle-1.0.zip
index 610667b53..9e2d0961d 100644
Binary files a/other_tool/getter-setter-fix-plugin-gradle/getter-setter-fix-plugin-gradle-1.0.zip and b/other_tool/getter-setter-fix-plugin-gradle/getter-setter-fix-plugin-gradle-1.0.zip differ
diff --git a/other_tool/getter-setter-fix-plugin-gradle/gradle.properties b/other_tool/getter-setter-fix-plugin-gradle/gradle.properties
new file mode 100644
index 000000000..21554ff12
--- /dev/null
+++ b/other_tool/getter-setter-fix-plugin-gradle/gradle.properties
@@ -0,0 +1,14 @@
+## For more details on how to configure your build environment visit
+# http://www.gradle.org/docs/current/userguide/build_environment.html
+#
+# Specifies the JVM arguments used for the daemon process.
+# The setting is particularly useful for tweaking memory settings.
+# Default value: -Xmx1024m -XX:MaxPermSize=256m
+# org.gradle.jvmargs=-Xmx2048m -XX:MaxPermSize=512m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8
+#
+# When configured, Gradle will run in incubating parallel mode.
+# This option should only be used with decoupled projects. More details, visit
+# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
+# org.gradle.parallel=true
+#Tue May 30 17:05:45 MSK 2023
+org.gradle.jvmargs=-Xmx2048M -Dkotlin.daemon.jvm.options\="-Xmx2048M"
diff --git a/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.jar b/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 000000000..249e5832f
Binary files /dev/null and b/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.properties b/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 000000000..aba097b06
--- /dev/null
+++ b/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,5 @@
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.6.1-bin.zip
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
\ No newline at end of file
diff --git a/other_tool/getter-setter-fix-plugin-gradle/gradlew b/other_tool/getter-setter-fix-plugin-gradle/gradlew
new file mode 100644
index 000000000..a69d9cb6c
--- /dev/null
+++ b/other_tool/getter-setter-fix-plugin-gradle/gradlew
@@ -0,0 +1,240 @@
+#!/bin/sh
+
+#
+# Copyright © 2015-2021 the original authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+##############################################################################
+#
+# Gradle start up script for POSIX generated by Gradle.
+#
+# Important for running:
+#
+# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
+# noncompliant, but you have some other compliant shell such as ksh or
+# bash, then to run this script, type that shell name before the whole
+# command line, like:
+#
+# ksh Gradle
+#
+# Busybox and similar reduced shells will NOT work, because this script
+# requires all of these POSIX shell features:
+# * functions;
+# * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
+# «${var#prefix}», «${var%suffix}», and «$( cmd )»;
+# * compound commands having a testable exit status, especially «case»;
+# * various built-in commands including «command», «set», and «ulimit».
+#
+# Important for patching:
+#
+# (2) This script targets any POSIX shell, so it avoids extensions provided
+# by Bash, Ksh, etc; in particular arrays are avoided.
+#
+# The "traditional" practice of packing multiple parameters into a
+# space-separated string is a well documented source of bugs and security
+# problems, so this is (mostly) avoided, by progressively accumulating
+# options in "$@", and eventually passing that to Java.
+#
+# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
+# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
+# see the in-line comments for details.
+#
+# There are tweaks for specific operating systems such as AIX, CygWin,
+# Darwin, MinGW, and NonStop.
+#
+# (3) This script is generated from the Groovy template
+# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
+# within the Gradle project.
+#
+# You can find Gradle at https://github.com/gradle/gradle/.
+#
+##############################################################################
+
+# Attempt to set APP_HOME
+
+# Resolve links: $0 may be a link
+app_path=$0
+
+# Need this for daisy-chained symlinks.
+while
+ APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
+ [ -h "$app_path" ]
+do
+ ls=$( ls -ld "$app_path" )
+ link=${ls#*' -> '}
+ case $link in #(
+ /*) app_path=$link ;; #(
+ *) app_path=$APP_HOME$link ;;
+ esac
+done
+
+APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
+
+APP_NAME="Gradle"
+APP_BASE_NAME=${0##*/}
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD=maximum
+
+warn () {
+ echo "$*"
+} >&2
+
+die () {
+ echo
+ echo "$*"
+ echo
+ exit 1
+} >&2
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+nonstop=false
+case "$( uname )" in #(
+ CYGWIN* ) cygwin=true ;; #(
+ Darwin* ) darwin=true ;; #(
+ MSYS* | MINGW* ) msys=true ;; #(
+ NONSTOP* ) nonstop=true ;;
+esac
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+ # IBM's JDK on AIX uses strange locations for the executables
+ JAVACMD=$JAVA_HOME/jre/sh/java
+ else
+ JAVACMD=$JAVA_HOME/bin/java
+ fi
+ if [ ! -x "$JAVACMD" ] ; then
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+ fi
+else
+ JAVACMD=java
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
+ case $MAX_FD in #(
+ max*)
+ MAX_FD=$( ulimit -H -n ) ||
+ warn "Could not query maximum file descriptor limit"
+ esac
+ case $MAX_FD in #(
+ '' | soft) :;; #(
+ *)
+ ulimit -n "$MAX_FD" ||
+ warn "Could not set maximum file descriptor limit to $MAX_FD"
+ esac
+fi
+
+# Collect all arguments for the java command, stacking in reverse order:
+# * args from the command line
+# * the main class name
+# * -classpath
+# * -D...appname settings
+# * --module-path (only if needed)
+# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
+
+# For Cygwin or MSYS, switch paths to Windows format before running java
+if "$cygwin" || "$msys" ; then
+ APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
+ CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
+
+ JAVACMD=$( cygpath --unix "$JAVACMD" )
+
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
+ for arg do
+ if
+ case $arg in #(
+ -*) false ;; # don't mess with options #(
+ /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
+ [ -e "$t" ] ;; #(
+ *) false ;;
+ esac
+ then
+ arg=$( cygpath --path --ignore --mixed "$arg" )
+ fi
+ # Roll the args list around exactly as many times as the number of
+ # args, so each arg winds up back in the position where it started, but
+ # possibly modified.
+ #
+ # NB: a `for` loop captures its iteration list before it begins, so
+ # changing the positional parameters here affects neither the number of
+ # iterations, nor the values presented in `arg`.
+ shift # remove old arg
+ set -- "$@" "$arg" # push replacement arg
+ done
+fi
+
+# Collect all arguments for the java command;
+# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
+# shell script including quotes and variable substitutions, so put them in
+# double quotes to make sure that they get re-expanded; and
+# * put everything else in single quotes, so that it's not re-expanded.
+
+set -- \
+ "-Dorg.gradle.appname=$APP_BASE_NAME" \
+ -classpath "$CLASSPATH" \
+ org.gradle.wrapper.GradleWrapperMain \
+ "$@"
+
+# Stop when "xargs" is not available.
+if ! command -v xargs >/dev/null 2>&1
+then
+ die "xargs is not available"
+fi
+
+# Use "xargs" to parse quoted args.
+#
+# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
+#
+# In Bash we could simply go:
+#
+# readarray ARGS < <( xargs -n1 <<<"$var" ) &&
+# set -- "${ARGS[@]}" "$@"
+#
+# but POSIX shell has neither arrays nor command substitution, so instead we
+# post-process each arg (as a line of input to sed) to backslash-escape any
+# character that might be a shell metacharacter, then use eval to reverse
+# that process (while maintaining the separation between arguments), and wrap
+# the whole thing up as a single "set" statement.
+#
+# This will of course break if any of these variables contains a newline or
+# an unmatched quote.
+#
+
+eval "set -- $(
+ printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
+ xargs -n1 |
+ sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
+ tr '\n' ' '
+ )" '"$@"'
+
+exec "$JAVACMD" "$@"
diff --git a/other_tool/getter-setter-fix-plugin-gradle/gradlew.bat b/other_tool/getter-setter-fix-plugin-gradle/gradlew.bat
new file mode 100644
index 000000000..f127cfd49
--- /dev/null
+++ b/other_tool/getter-setter-fix-plugin-gradle/gradlew.bat
@@ -0,0 +1,91 @@
+@rem
+@rem Copyright 2015 the original author or authors.
+@rem
+@rem Licensed under the Apache License, Version 2.0 (the "License");
+@rem you may not use this file except in compliance with the License.
+@rem You may obtain a copy of the License at
+@rem
+@rem https://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing, software
+@rem distributed under the License is distributed on an "AS IS" BASIS,
+@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem See the License for the specific language governing permissions and
+@rem limitations under the License.
+@rem
+
+@if "%DEBUG%"=="" @echo off
+@rem ##########################################################################
+@rem
+@rem Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+set DIRNAME=%~dp0
+if "%DIRNAME%"=="" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Resolve any "." and ".." in APP_HOME to make it shorter.
+for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if %ERRORLEVEL% equ 0 goto execute
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto execute
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
+
+:end
+@rem End local scope for the variables with windows NT shell
+if %ERRORLEVEL% equ 0 goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+set EXIT_CODE=%ERRORLEVEL%
+if %EXIT_CODE% equ 0 set EXIT_CODE=1
+if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
+exit /b %EXIT_CODE%
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/other_tool/kspcomplier/build.gradle.kts b/other_tool/kspcomplier/build.gradle.kts
index 781aecc72..4caca8de8 100644
--- a/other_tool/kspcomplier/build.gradle.kts
+++ b/other_tool/kspcomplier/build.gradle.kts
@@ -4,7 +4,7 @@ plugins {
}
group = "dev.umerov.ksp"
-version = "1.0-SNAPSHOT"
+version = "1.0"
repositories {
mavenCentral()
@@ -13,10 +13,10 @@ repositories {
dependencies {
implementation(kotlin("stdlib"))
- implementation("androidx.annotation:annotation:1.4.0")
- implementation("org.jetbrains.kotlinx:kotlinx-serialization-core:1.4.0-RC")
+ implementation("androidx.annotation:annotation:1.6.0")
+ implementation("org.jetbrains.kotlinx:kotlinx-serialization-core:1.5.1")
- implementation("com.google.devtools.ksp:symbol-processing-api:1.7.0-1.0.6")
+ implementation("com.google.devtools.ksp:symbol-processing-api:1.9.0-Beta-1.0.11")
}
sourceSets.main {
diff --git a/picasso3/src/main/kotlin/com/squareup/picasso3/PicassoDrawable.kt b/picasso3/src/main/kotlin/com/squareup/picasso3/PicassoDrawable.kt
index baf43f3b5..1115f1036 100644
--- a/picasso3/src/main/kotlin/com/squareup/picasso3/PicassoDrawable.kt
+++ b/picasso3/src/main/kotlin/com/squareup/picasso3/PicassoDrawable.kt
@@ -16,7 +16,13 @@
package com.squareup.picasso3
import android.content.Context
-import android.graphics.*
+import android.graphics.Bitmap
+import android.graphics.Canvas
+import android.graphics.Color
+import android.graphics.ColorFilter
+import android.graphics.Paint
+import android.graphics.Path
+import android.graphics.Rect
import android.graphics.drawable.Animatable
import android.graphics.drawable.BitmapDrawable
import android.graphics.drawable.Drawable
diff --git a/viewpager2/build.gradle b/viewpager2/build.gradle
index 68b9b1e7b..24906499a 100644
--- a/viewpager2/build.gradle
+++ b/viewpager2/build.gradle
@@ -27,8 +27,10 @@ android {
}
gradle.projectsEvaluated {
- tasks.withType(JavaCompile) {
- options.compilerArgs << "-Xlint:deprecation"
+ tasks.withType(JavaCompile).tap {
+ configureEach {
+ options.compilerArgs << "-Xlint:deprecation"
+ }
}
}
}
diff --git a/viewpager2/src/main/java/androidx/recyclerview/widget/ChildHelper.java b/viewpager2/src/main/java/androidx/recyclerview/widget/ChildHelper.java
index 198b70f95..5bfee529a 100644
--- a/viewpager2/src/main/java/androidx/recyclerview/widget/ChildHelper.java
+++ b/viewpager2/src/main/java/androidx/recyclerview/widget/ChildHelper.java
@@ -20,6 +20,8 @@
import android.view.View;
import android.view.ViewGroup;
+import androidx.annotation.NonNull;
+
import java.util.ArrayList;
import java.util.List;
@@ -358,9 +360,10 @@ void unhide(View view) {
unhideViewInternal(view);
}
+ @NonNull
@Override
public String toString() {
- return mBucket.toString() + ", hidden list:" + mHiddenViews.size();
+ return mBucket + ", hidden list:" + mHiddenViews.size();
}
/**
@@ -503,10 +506,11 @@ int countOnesBefore(int index) {
}
}
+ @NonNull
@Override
public String toString() {
return mNext == null ? Long.toBinaryString(mData)
- : mNext.toString() + "xx" + Long.toBinaryString(mData);
+ : mNext + "xx" + Long.toBinaryString(mData);
}
}