diff --git a/FileGallery.jpg b/FileGallery.jpg index d73a05243..8096f4405 100644 Binary files a/FileGallery.jpg and b/FileGallery.jpg differ diff --git a/README.md b/README.md index ebf0a75f7..51a47f1dd 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,9 @@ Инструкция по сборке: Требуется: - 1) Android Studio Flamingo (2022.2.1) или выше. Kotlin 1.8.* + 1) Android Studio Giraffe (2022.3.1) или выше. Kotlin 1.9.* 2) Android SDK 33 - 3) Android NDK 25.1.8937393 + 3) Android NDK 25.2.9519653 Если не работает музыка в Fenrir Kate, обновите kate_receipt_gms_token в app.build_config. Взять токен можно из Kate Mobile Extra Mod diff --git a/app_fenrir/build.gradle b/app_fenrir/build.gradle index 67f4bff3d..1aa88212e 100644 --- a/app_fenrir/build.gradle +++ b/app_fenrir/build.gradle @@ -52,7 +52,7 @@ android { checkReleaseBuilds = true } - tasks.withType(JavaCompile) { + tasks.withType(JavaCompile).configureEach { options.compilerArgs << "-Xmaxwarns" << "1000" << "-Xmaxerrs" << "1000" } @@ -76,7 +76,6 @@ android { release { minifyEnabled = false shrinkResources = false - zipAlignEnabled = true } debug { minifyEnabled = false @@ -84,7 +83,7 @@ android { } } - flavorDimensions "type" + flavorDimensions.add("type") productFlavors { fenrir { applicationId = "dev.ragnarok.fenrir" diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/accounts/AccountsPresenter.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/accounts/AccountsPresenter.kt index 61ac5a044..d030ddcc9 100644 --- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/accounts/AccountsPresenter.kt +++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/accounts/AccountsPresenter.kt @@ -36,6 +36,7 @@ import dev.ragnarok.fenrir.kJson import dev.ragnarok.fenrir.longpoll.LongpollInstance import dev.ragnarok.fenrir.model.Account import dev.ragnarok.fenrir.model.IOwnersBundle +import dev.ragnarok.fenrir.model.MessageStatus import dev.ragnarok.fenrir.model.SaveAccount import 
dev.ragnarok.fenrir.model.User import dev.ragnarok.fenrir.model.criteria.DialogsCriteria @@ -507,13 +508,19 @@ class AccountsPresenter(savedInstanceState: Bundle?) : val dialogsJsonElem = i.jsonObject["conversation"]?.jsonArray ?: continue if (!dialogsJsonElem.isEmpty()) { - Includes.stores.dialogs().insertDialogs( - aid, kJson.decodeFromJsonElement( - ListSerializer( - DialogDboEntity.serializer() - ), dialogsJsonElem - ), true - ).blockingAwait() + val btmp = kJson.decodeFromJsonElement( + ListSerializer( + DialogDboEntity.serializer() + ), dialogsJsonElem + ) + if (btmp.nonNullNoEmpty()) { + for (o in btmp) { + o.message?.setStatus(MessageStatus.SENT) + } + Includes.stores.dialogs().insertDialogs( + aid, btmp, true + ).blockingAwait() + } } } } diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/base/RecyclerBindableAdapter.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/base/RecyclerBindableAdapter.kt index 6c35aec93..da02b5a12 100644 --- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/base/RecyclerBindableAdapter.kt +++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/base/RecyclerBindableAdapter.kt @@ -204,8 +204,10 @@ abstract class RecyclerBindableAdapter(private } //empty out our FrameLayout and replace with our header/footer - (vh.itemView as ViewGroup).removeAllViews() - vh.itemView.addView(view) + (vh.itemView as ViewGroup).let { + it.removeAllViews() + it.addView(view) + } } private fun isHeader(position: Int): Boolean { diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/messages/chat/ChatFragment.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/messages/chat/ChatFragment.kt index ffec22acc..1ba67e6df 100644 --- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/messages/chat/ChatFragment.kt +++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/messages/chat/ChatFragment.kt @@ -6,6 +6,7 @@ import android.annotation.SuppressLint import 
android.app.Activity.RESULT_OK import android.app.Dialog import android.content.* +import android.graphics.Bitmap import android.net.* import android.os.Build import android.os.Bundle @@ -1570,7 +1571,12 @@ class ChatFragment : PlaceSupportMvpFragment(), IChatV it, Uri.fromFile(File(requireActivity().externalCacheDir.toString() + File.separator + "scale.jpg")) ) - .withAspectRatio(1f, 1f) + .withOptions( + UCrop.Options().withAspectRatio(1f, 1f) + .setCompressionQuality(100) + .setCompressionFormat(Bitmap.CompressFormat.JPEG) + .setHideBottomControls(false) + ) .getIntent(requireActivity()) } ) diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/wall/userwall/UserWallFragment.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/wall/userwall/UserWallFragment.kt index a04509a42..8ac15af91 100644 --- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/wall/userwall/UserWallFragment.kt +++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/fragment/wall/userwall/UserWallFragment.kt @@ -92,7 +92,11 @@ class UserWallFragment : AbsWallFragment(), IU to_up, Uri.fromFile(File(requireActivity().externalCacheDir.toString() + File.separator + "scale.jpg")) ) - .withAspectRatio(1f, 1f) + .withOptions( + UCrop.Options().withAspectRatio(1f, 1f).setCompressionQuality(100) + .setCompressionFormat(Bitmap.CompressFormat.JPEG) + .setHideBottomControls(false) + ) .getIntent(requireActivity()) ) } diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/longpoll/NotificationHelper.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/longpoll/NotificationHelper.kt index 51a0ca6c8..351c8b1d2 100644 --- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/longpoll/NotificationHelper.kt +++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/longpoll/NotificationHelper.kt @@ -650,7 +650,7 @@ object NotificationHelper { return if (urit != null) Content(MimeType, urit) else null } - @SuppressLint("RestrictedApi") + @SuppressLint("RestrictedApi", "ReportShortcutUsage") 
private fun createNotificationShortcut( context: Context, builder: NotificationCompat.Builder, diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackController.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackController.kt index f7c1f06a7..64e492f10 100644 --- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackController.kt +++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackController.kt @@ -10,6 +10,7 @@ import android.content.IntentFilter import android.content.ServiceConnection import android.os.IBinder import android.os.RemoteException +import androidx.core.content.ContextCompat import dev.ragnarok.fenrir.R import dev.ragnarok.fenrir.model.Audio import dev.ragnarok.fenrir.settings.Settings @@ -55,7 +56,12 @@ object MusicPlaybackController { filter.addAction(MusicPlaybackService.META_CHANGED) filter.addAction(MusicPlaybackService.PREPARED) filter.addAction(MusicPlaybackService.QUEUE_CHANGED) - appContext.registerReceiver(receiver, filter) + ContextCompat.registerReceiver( + appContext, + receiver, + filter, + ContextCompat.RECEIVER_NOT_EXPORTED + ) } fun bindToServiceWithoutStart( diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackService.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackService.kt index d6c538eeb..cd6ab7cde 100644 --- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackService.kt +++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/media/music/MusicPlaybackService.kt @@ -18,6 +18,7 @@ import android.support.v4.media.session.MediaControllerCompat import android.support.v4.media.session.MediaSessionCompat import android.support.v4.media.session.PlaybackStateCompat import android.util.Log +import androidx.core.content.ContextCompat import androidx.media.session.MediaButtonReceiver import androidx.media3.common.AudioAttributes import androidx.media3.common.C 
@@ -157,7 +158,12 @@ class MusicPlaybackService : Service() { filter.addAction(PREVIOUS_ACTION) filter.addAction(REPEAT_ACTION) filter.addAction(SHUFFLE_ACTION) - registerReceiver(mIntentReceiver, filter) + ContextCompat.registerReceiver( + this, + mIntentReceiver, + filter, + ContextCompat.RECEIVER_NOT_EXPORTED + ) // Initialize the delayed shutdown intent val shutdownIntent = Intent(this, MusicPlaybackService::class.java) diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/ShortcutUtils.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/ShortcutUtils.kt index 38633a3b7..60aeb74e1 100644 --- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/ShortcutUtils.kt +++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/ShortcutUtils.kt @@ -1,5 +1,6 @@ package dev.ragnarok.fenrir.util +import android.annotation.SuppressLint import android.annotation.TargetApi import android.content.Context import android.content.Intent @@ -216,6 +217,7 @@ object ShortcutUtils { } + @SuppressLint("ReportShortcutUsage") @TargetApi(Build.VERSION_CODES.N_MR1) fun addDynamicShortcut(context: Context, accountId: Long, peer: Peer): Completable { val app = context.applicationContext diff --git a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/Utils.kt b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/Utils.kt index 0fb1b232e..2a80e6a0e 100644 --- a/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/Utils.kt +++ b/app_fenrir/src/main/kotlin/dev/ragnarok/fenrir/util/Utils.kt @@ -1002,7 +1002,7 @@ object Utils { return } if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) { - drawable.colorFilter = BlendModeColorFilter(color, BlendMode.MODULATE) + drawable.colorFilter = PorterDuffColorFilter(color, PorterDuff.Mode.MULTIPLY) } else { drawable.setColorFilter(color, PorterDuff.Mode.MULTIPLY) } @@ -1013,7 +1013,7 @@ object Utils { return } if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) { - view.colorFilter = BlendModeColorFilter(color, BlendMode.MODULATE) + 
view.colorFilter = PorterDuffColorFilter(color, PorterDuff.Mode.MULTIPLY) } else { view.setColorFilter(color, PorterDuff.Mode.MULTIPLY) } @@ -1024,7 +1024,7 @@ object Utils { return } if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) { - view.colorFilter = BlendModeColorFilter(color, BlendMode.MODULATE) + view.colorFilter = PorterDuffColorFilter(color, PorterDuff.Mode.MULTIPLY) } else { view.setColorFilter(color, PorterDuff.Mode.MULTIPLY) } diff --git a/app_filegallery/build.gradle b/app_filegallery/build.gradle index d0b5794b0..b64af2cf7 100644 --- a/app_filegallery/build.gradle +++ b/app_filegallery/build.gradle @@ -51,7 +51,7 @@ android { checkReleaseBuilds = true } - tasks.withType(JavaCompile) { + tasks.withType(JavaCompile).configureEach { options.compilerArgs << "-Xmaxwarns" << "1000" << "-Xmaxerrs" << "1000" } @@ -75,7 +75,6 @@ android { release { minifyEnabled = false shrinkResources = false - zipAlignEnabled = true } debug { minifyEnabled = false diff --git a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/fragment/PreferencesFragment.kt b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/fragment/PreferencesFragment.kt index 60da9a43b..ff16202b1 100644 --- a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/fragment/PreferencesFragment.kt +++ b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/fragment/PreferencesFragment.kt @@ -802,7 +802,7 @@ class PreferencesFragment : AbsPreferencesFragment(), PreferencesAdapter.OnScree ".SLOGAN" ).absolutePath || sel.absolutePath == File( Environment.getExternalStorageDirectory(), - ".OplusOS" + "OplusOS" ).absolutePath || sel.absolutePath == File( Environment.getExternalStorageDirectory(), ".time" diff --git a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackController.kt b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackController.kt index 20db5aa65..b99febcf6 100644 --- 
a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackController.kt +++ b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackController.kt @@ -10,6 +10,7 @@ import android.content.IntentFilter import android.content.ServiceConnection import android.os.IBinder import android.os.RemoteException +import androidx.core.content.ContextCompat import dev.ragnarok.filegallery.R import dev.ragnarok.filegallery.model.Audio import dev.ragnarok.filegallery.settings.Settings @@ -56,7 +57,12 @@ object MusicPlaybackController { filter.addAction(MusicPlaybackService.META_CHANGED) filter.addAction(MusicPlaybackService.PREPARED) filter.addAction(MusicPlaybackService.QUEUE_CHANGED) - appContext.registerReceiver(receiver, filter) + ContextCompat.registerReceiver( + appContext, + receiver, + filter, + ContextCompat.RECEIVER_NOT_EXPORTED + ) } fun bindToServiceWithoutStart( diff --git a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackService.kt b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackService.kt index 613383797..df560de11 100644 --- a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackService.kt +++ b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/media/music/MusicPlaybackService.kt @@ -18,6 +18,7 @@ import android.support.v4.media.session.MediaControllerCompat import android.support.v4.media.session.MediaSessionCompat import android.support.v4.media.session.PlaybackStateCompat import android.util.Log +import androidx.core.content.ContextCompat import androidx.media.session.MediaButtonReceiver import androidx.media3.common.AudioAttributes import androidx.media3.common.C @@ -150,7 +151,12 @@ class MusicPlaybackService : Service() { filter.addAction(PREVIOUS_ACTION) filter.addAction(REPEAT_ACTION) filter.addAction(SHUFFLE_ACTION) - registerReceiver(mIntentReceiver, filter) + ContextCompat.registerReceiver( + this, 
+ mIntentReceiver, + filter, + ContextCompat.RECEIVER_NOT_EXPORTED + ) // Initialize the delayed shutdown intent val shutdownIntent = Intent(this, MusicPlaybackService::class.java) diff --git a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/Utils.kt b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/Utils.kt index dbaa92650..66f87e98c 100644 --- a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/Utils.kt +++ b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/Utils.kt @@ -191,7 +191,7 @@ object Utils { fun setColorFilter(view: ImageView?, @ColorInt color: Int) { if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) { - view?.colorFilter = BlendModeColorFilter(color, BlendMode.MODULATE) + view?.colorFilter = PorterDuffColorFilter(color, PorterDuff.Mode.MULTIPLY) } else { view?.setColorFilter(color, PorterDuff.Mode.MULTIPLY) } @@ -202,7 +202,7 @@ object Utils { return } if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) { - view.colorFilter = BlendModeColorFilter(color, BlendMode.MODULATE) + view.colorFilter = PorterDuffColorFilter(color, PorterDuff.Mode.MULTIPLY) } else { view.setColorFilter(color, PorterDuff.Mode.MULTIPLY) } diff --git a/build.gradle b/build.gradle index 6754a633e..3d256c3e1 100644 --- a/build.gradle +++ b/build.gradle @@ -41,7 +41,7 @@ buildscript { ext.tracingVersion = "1.2.0-rc01" ext.transitionVersion = "1.4.1" ext.vectordrawableVersion = "1.2.0-beta01" - ext.webkitVersion = "1.7.0-rc01" + ext.webkitVersion = "1.7.0" ext.workVersion = "2.8.1" //firebase libraries @@ -57,7 +57,7 @@ buildscript { ext.autoValueVersion = "1.10.1" //common libraries - ext.kotlin_version = "1.8.21" + ext.kotlin_version = "1.9.0-Beta" ext.kotlin_coroutines = "1.7.1" ext.kotlin_serializer = "1.5.1" ext.okhttpLibraryVersion = "5.0.0-alpha.11" @@ -66,7 +66,7 @@ buildscript { ext.guavaVersion = "32.0.0-android" ext.errorproneVersion = "2.15.0" ext.checkerCompatQualVersion = "2.5.5" - ext.checkerQualAndroidVersion = 
"3.34.0" + ext.checkerQualAndroidVersion = "3.35.0" ext.desugarLibraryVersion = "2.0.3" //APP_PROPS @@ -91,7 +91,7 @@ buildscript { mavenCentral() } dependencies { - classpath "com.android.tools.build:gradle:8.0.2" + classpath "com.android.tools.build:gradle:8.1.0-beta04" classpath "com.google.gms:google-services:4.3.15" classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version" classpath "org.jetbrains.kotlin:kotlin-serialization:$kotlin_version" @@ -106,6 +106,6 @@ allprojects { } } -task clean(type: Delete) { +tasks.register('clean', Delete) { delete rootProject.buildDir } diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index e708b1c02..249e5832f 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradlew b/gradlew index 4f906e0c8..a69d9cb6c 100644 --- a/gradlew +++ b/gradlew @@ -1,7 +1,7 @@ -#!/usr/bin/env sh +#!/bin/sh # -# Copyright 2015 the original author or authors. +# Copyright © 2015-2021 the original authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,67 +17,101 @@ # ############################################################################## -## -## Gradle start up script for UN*X -## +# +# Gradle start up script for POSIX generated by Gradle. +# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. 
If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. +# ############################################################################## # Attempt to set APP_HOME + # Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. 
-while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi +app_path=$0 + +# Need this for daisy-chained symlinks. +while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null + +APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit APP_NAME="Gradle" -APP_BASE_NAME=`basename "$0"` +APP_BASE_NAME=${0##*/} # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' # Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD="maximum" +MAX_FD=maximum warn () { echo "$*" -} +} >&2 die () { echo echo "$*" echo exit 1 -} +} >&2 # OS specific support (must be 'true' or 'false'). cygwin=false msys=false darwin=false nonstop=false -case "`uname`" in - CYGWIN* ) - cygwin=true - ;; - Darwin* ) - darwin=true - ;; - MINGW* ) - msys=true - ;; - NONSTOP* ) - nonstop=true - ;; +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; esac CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar @@ -87,9 +121,9 @@ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables - JAVACMD="$JAVA_HOME/jre/sh/java" + JAVACMD=$JAVA_HOME/jre/sh/java else - JAVACMD="$JAVA_HOME/bin/java" + JAVACMD=$JAVA_HOME/bin/java fi if [ ! 
-x "$JAVACMD" ] ; then die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME @@ -98,7 +132,7 @@ Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi else - JAVACMD="java" + JAVACMD=java which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the @@ -106,80 +140,101 @@ location of your Java installation." fi # Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then - MAX_FD_LIMIT=`ulimit -H -n` - if [ $? -eq 0 ] ; then - if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then - MAX_FD="$MAX_FD_LIMIT" - fi - ulimit -n $MAX_FD - if [ $? -ne 0 ] ; then - warn "Could not set maximum file descriptor limit: $MAX_FD" - fi - else - warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" - fi +if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac fi -# For Darwin, add options to specify how the application appears in the dock -if $darwin; then - GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" -fi +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
# For Cygwin or MSYS, switch paths to Windows format before running java -if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then - APP_HOME=`cygpath --path --mixed "$APP_HOME"` - CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - - JAVACMD=`cygpath --unix "$JAVACMD"` - - # We build the pattern for arguments to be converted via cygpath - ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` - SEP="" - for dir in $ROOTDIRSRAW ; do - ROOTDIRS="$ROOTDIRS$SEP$dir" - SEP="|" - done - OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments - if [ "$GRADLE_CYGPATTERN" != "" ] ; then - OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" - fi +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) + # Now convert the arguments - kludge to limit ourselves to /bin/sh - i=0 - for arg in "$@" ; do - CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` - CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option - - if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition - eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` - else - eval `echo args$i`="\"$arg\"" + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) fi - i=`expr $i + 1` + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. 
+ shift # remove old arg + set -- "$@" "$arg" # push replacement arg done - case $i in - 0) set -- ;; - 1) set -- "$args0" ;; - 2) set -- "$args0" "$args1" ;; - 3) set -- "$args0" "$args1" "$args2" ;; - 4) set -- "$args0" "$args1" "$args2" "$args3" ;; - 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; - 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; - 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; - 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; - 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; - esac fi -# Escape application args -save () { - for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done - echo " " -} -APP_ARGS=`save "$@"` +# Collect all arguments for the java command; +# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of +# shell script including quotes and variable substitutions, so put them in +# double quotes to make sure that they get re-expanded; and +# * put everything else in single quotes, so that it's not re-expanded. + +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Stop when "xargs" is not available. +if ! command -v xargs >/dev/null 2>&1 +then + die "xargs is not available" +fi + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. +# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. 
+# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. +# -# Collect all arguments for the java command, following the shell quoting and substitution rules -eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat index 107acd32c..f127cfd49 100644 --- a/gradlew.bat +++ b/gradlew.bat @@ -14,7 +14,7 @@ @rem limitations under the License. @rem -@if "%DEBUG%" == "" @echo off +@if "%DEBUG%"=="" @echo off @rem ########################################################################## @rem @rem Gradle startup script for Windows @@ -25,7 +25,7 @@ if "%OS%"=="Windows_NT" setlocal set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. +if "%DIRNAME%"=="" set DIRNAME=. set APP_BASE_NAME=%~n0 set APP_HOME=%DIRNAME% @@ -40,7 +40,7 @@ if defined JAVA_HOME goto findJavaFromJavaHome set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto execute +if %ERRORLEVEL% equ 0 goto execute echo. echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. @@ -75,13 +75,15 @@ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar :end @rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd +if %ERRORLEVEL% equ 0 goto mainEnd :fail rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of rem the _cmd.exe /c_ return code! 
-if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 +set EXIT_CODE=%ERRORLEVEL% +if %EXIT_CODE% equ 0 set EXIT_CODE=1 +if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% +exit /b %EXIT_CODE% :mainEnd if "%OS%"=="Windows_NT" endlocal diff --git a/image/src/main/AndroidManifest.xml b/image/src/main/AndroidManifest.xml index 28cbe9a85..c3022eb47 100644 --- a/image/src/main/AndroidManifest.xml +++ b/image/src/main/AndroidManifest.xml @@ -1,7 +1,7 @@ - + = Build.VERSION_CODES.TIRAMISU) { + return intent.getParcelableExtra(EXTRA_OUTPUT_URI, Uri.class); + } else { + return intent.getParcelableExtra(EXTRA_OUTPUT_URI); + } } /** @@ -110,9 +117,14 @@ public static float getOutputCropAspectRatio(@NonNull Intent intent) { * @param result crop result Intent * @return Throwable that could happen while image processing */ + @SuppressWarnings("deprecation") @Nullable public static Throwable getError(@NonNull Intent result) { - return (Throwable) result.getSerializableExtra(EXTRA_ERROR); + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.TIRAMISU) { + return result.getSerializableExtra(EXTRA_ERROR, Throwable.class); + } else { + return (Throwable) result.getSerializableExtra(EXTRA_ERROR); + } } /** @@ -174,20 +186,6 @@ public Intent getIntent(@NonNull Context context) { return mCropIntent; } - /** - * Get Fragment {@link UCropFragment} - * - * @return Fragment of {@link UCropFragment} - */ - public UCropFragment getFragment() { - return UCropFragment.newInstance(mCropOptionsBundle); - } - - public UCropFragment getFragment(Bundle bundle) { - mCropOptionsBundle = bundle; - return getFragment(); - } - /** * Class that helps to setup advanced configs that are not commonly used. * Use it with method {@link #withOptions(Options)} @@ -250,24 +248,27 @@ public Bundle getOptionBundle() { /** * Set one of {@link android.graphics.Bitmap.CompressFormat} that will be used to save resulting Bitmap. 
*/ - public void setCompressionFormat(@NonNull Bitmap.CompressFormat format) { + public Options setCompressionFormat(@NonNull Bitmap.CompressFormat format) { mOptionBundle.putString(EXTRA_COMPRESSION_FORMAT_NAME, format.name()); + return this; } /** * Set compression quality [0-100] that will be used to save resulting Bitmap. */ - public void setCompressionQuality(@IntRange(from = 0) int compressQuality) { + public Options setCompressionQuality(@IntRange(from = 0) int compressQuality) { mOptionBundle.putInt(EXTRA_COMPRESSION_QUALITY, compressQuality); + return this; } /** * Choose what set of gestures will be enabled on each tab - if any. */ - public void setAllowedGestures(@UCropActivity.GestureTypes int tabScale, - @UCropActivity.GestureTypes int tabRotate, - @UCropActivity.GestureTypes int tabAspectRatio) { + public Options setAllowedGestures(@UCropActivity.GestureTypes int tabScale, + @UCropActivity.GestureTypes int tabRotate, + @UCropActivity.GestureTypes int tabAspectRatio) { mOptionBundle.putIntArray(EXTRA_ALLOWED_GESTURES, new int[]{tabScale, tabRotate, tabAspectRatio}); + return this; } /** @@ -275,8 +276,9 @@ public void setAllowedGestures(@UCropActivity.GestureTypes int tabScale, * * @param maxScaleMultiplier - (minScale * maxScaleMultiplier) = maxScale */ - public void setMaxScaleMultiplier(@FloatRange(from = 1.0, fromInclusive = false) float maxScaleMultiplier) { + public Options setMaxScaleMultiplier(@FloatRange(from = 1.0, fromInclusive = false) float maxScaleMultiplier) { mOptionBundle.putFloat(EXTRA_MAX_SCALE_MULTIPLIER, maxScaleMultiplier); + return this; } /** @@ -284,8 +286,9 @@ public void setMaxScaleMultiplier(@FloatRange(from = 1.0, fromInclusive = false) * * @param durationMillis - duration in milliseconds */ - public void setImageToCropBoundsAnimDuration(@IntRange(from = MIN_SIZE) int durationMillis) { + public Options setImageToCropBoundsAnimDuration(@IntRange(from = MIN_SIZE) int durationMillis) { 
mOptionBundle.putInt(EXTRA_IMAGE_TO_CROP_BOUNDS_ANIM_DURATION, durationMillis); + return this; } /** @@ -293,148 +296,169 @@ public void setImageToCropBoundsAnimDuration(@IntRange(from = MIN_SIZE) int dura * * @param maxBitmapSize - size in pixels */ - public void setMaxBitmapSize(@IntRange(from = MIN_SIZE) int maxBitmapSize) { + public Options setMaxBitmapSize(@IntRange(from = MIN_SIZE) int maxBitmapSize) { mOptionBundle.putInt(EXTRA_MAX_BITMAP_SIZE, maxBitmapSize); + return this; } /** * @param color - desired color of dimmed area around the crop bounds */ - public void setDimmedLayerColor(@ColorInt int color) { + public Options setDimmedLayerColor(@ColorInt int color) { mOptionBundle.putInt(EXTRA_DIMMED_LAYER_COLOR, color); + return this; } /** * @param isCircle - set it to true if you want dimmed layer to have an circle inside */ - public void setCircleDimmedLayer(boolean isCircle) { + public Options setCircleDimmedLayer(boolean isCircle) { mOptionBundle.putBoolean(EXTRA_CIRCLE_DIMMED_LAYER, isCircle); + return this; } /** * @param show - set to true if you want to see a crop frame rectangle on top of an image */ - public void setShowCropFrame(boolean show) { + public Options setShowCropFrame(boolean show) { mOptionBundle.putBoolean(EXTRA_SHOW_CROP_FRAME, show); + return this; } /** * @param color - desired color of crop frame */ - public void setCropFrameColor(@ColorInt int color) { + public Options setCropFrameColor(@ColorInt int color) { mOptionBundle.putInt(EXTRA_CROP_FRAME_COLOR, color); + return this; } /** * @param width - desired width of crop frame line in pixels */ - public void setCropFrameStrokeWidth(@IntRange(from = 0) int width) { + public Options setCropFrameStrokeWidth(@IntRange(from = 0) int width) { mOptionBundle.putInt(EXTRA_CROP_FRAME_STROKE_WIDTH, width); + return this; } /** * @param show - set to true if you want to see a crop grid/guidelines on top of an image */ - public void setShowCropGrid(boolean show) { + public Options 
setShowCropGrid(boolean show) { mOptionBundle.putBoolean(EXTRA_SHOW_CROP_GRID, show); + return this; } /** * @param count - crop grid rows count. */ - public void setCropGridRowCount(@IntRange(from = 0) int count) { + public Options setCropGridRowCount(@IntRange(from = 0) int count) { mOptionBundle.putInt(EXTRA_CROP_GRID_ROW_COUNT, count); + return this; } /** * @param count - crop grid columns count. */ - public void setCropGridColumnCount(@IntRange(from = 0) int count) { + public Options setCropGridColumnCount(@IntRange(from = 0) int count) { mOptionBundle.putInt(EXTRA_CROP_GRID_COLUMN_COUNT, count); + return this; } /** * @param color - desired color of crop grid/guidelines */ - public void setCropGridColor(@ColorInt int color) { + public Options setCropGridColor(@ColorInt int color) { mOptionBundle.putInt(EXTRA_CROP_GRID_COLOR, color); + return this; } /** * @param width - desired width of crop grid lines in pixels */ - public void setCropGridStrokeWidth(@IntRange(from = 0) int width) { + public Options setCropGridStrokeWidth(@IntRange(from = 0) int width) { mOptionBundle.putInt(EXTRA_CROP_GRID_STROKE_WIDTH, width); + return this; } /** * @param color - desired resolved color of the toolbar */ - public void setToolbarColor(@ColorInt int color) { + public Options setToolbarColor(@ColorInt int color) { mOptionBundle.putInt(EXTRA_TOOL_BAR_COLOR, color); + return this; } /** * @param color - desired resolved color of the statusbar */ - public void setStatusBarColor(@ColorInt int color) { + public Options setStatusBarColor(@ColorInt int color) { mOptionBundle.putInt(EXTRA_STATUS_BAR_COLOR, color); + return this; } /** * @param color - desired resolved color of the active and selected widget and progress wheel middle line (default is white) */ - public void setActiveControlsWidgetColor(@ColorInt int color) { + public Options setActiveControlsWidgetColor(@ColorInt int color) { mOptionBundle.putInt(EXTRA_UCROP_COLOR_CONTROLS_WIDGET_ACTIVE, color); + return this; } /** 
* @param color - desired resolved color of Toolbar text and buttons (default is darker orange) */ - public void setToolbarWidgetColor(@ColorInt int color) { + public Options setToolbarWidgetColor(@ColorInt int color) { mOptionBundle.putInt(EXTRA_UCROP_WIDGET_COLOR_TOOLBAR, color); + return this; } /** * @param text - desired text for Toolbar title */ - public void setToolbarTitle(@Nullable String text) { + public Options setToolbarTitle(@Nullable String text) { mOptionBundle.putString(EXTRA_UCROP_TITLE_TEXT_TOOLBAR, text); + return this; } /** * @param drawable - desired drawable for the Toolbar left cancel icon */ - public void setToolbarCancelDrawable(@DrawableRes int drawable) { + public Options setToolbarCancelDrawable(@DrawableRes int drawable) { mOptionBundle.putInt(EXTRA_UCROP_WIDGET_CANCEL_DRAWABLE, drawable); + return this; } /** * @param drawable - desired drawable for the Toolbar right crop icon */ - public void setToolbarCropDrawable(@DrawableRes int drawable) { + public Options setToolbarCropDrawable(@DrawableRes int drawable) { mOptionBundle.putInt(EXTRA_UCROP_WIDGET_CROP_DRAWABLE, drawable); + return this; } /** * @param color - desired resolved color of logo fill (default is darker grey) */ - public void setLogoColor(@ColorInt int color) { + public Options setLogoColor(@ColorInt int color) { mOptionBundle.putInt(EXTRA_UCROP_LOGO_COLOR, color); + return this; } /** * @param hide - set to true to hide the bottom controls (shown by default) */ - public void setHideBottomControls(boolean hide) { + public Options setHideBottomControls(boolean hide) { mOptionBundle.putBoolean(EXTRA_HIDE_BOTTOM_CONTROLS, hide); + return this; } /** * @param enabled - set to true to let user resize crop bounds (disabled by default) */ - public void setFreeStyleCropEnabled(boolean enabled) { + public Options setFreeStyleCropEnabled(boolean enabled) { mOptionBundle.putBoolean(EXTRA_FREE_STYLE_CROP, enabled); + return this; } /** @@ -443,7 +467,7 @@ public void 
setFreeStyleCropEnabled(boolean enabled) { * @param selectedByDefault - index of aspect ratio option that is selected by default (starts with 0). * @param aspectRatio - list of aspect ratio options that are available to user */ - public void setAspectRatioOptions(int selectedByDefault, AspectRatio... aspectRatio) { + public Options setAspectRatioOptions(int selectedByDefault, AspectRatio... aspectRatio) { if (selectedByDefault > aspectRatio.length) { throw new IllegalArgumentException(String.format(Locale.US, "Index [selectedByDefault = %d] cannot be higher than aspect ratio options count [count = %d].", @@ -451,13 +475,15 @@ public void setAspectRatioOptions(int selectedByDefault, AspectRatio... aspectRa } mOptionBundle.putInt(EXTRA_ASPECT_RATIO_SELECTED_BY_DEFAULT, selectedByDefault); mOptionBundle.putParcelableArrayList(EXTRA_ASPECT_RATIO_OPTIONS, new ArrayList(Arrays.asList(aspectRatio))); + return this; } /** * @param color - desired background color that should be applied to the root view */ - public void setRootViewBackgroundColor(@ColorInt int color) { + public Options setRootViewBackgroundColor(@ColorInt int color) { mOptionBundle.putInt(EXTRA_UCROP_ROOT_VIEW_BACKGROUND_COLOR, color); + return this; } /** @@ -467,18 +493,20 @@ public void setRootViewBackgroundColor(@ColorInt int color) { * @param x aspect ratio X * @param y aspect ratio Y */ - public void withAspectRatio(float x, float y) { + public Options withAspectRatio(float x, float y) { mOptionBundle.putFloat(EXTRA_ASPECT_RATIO_X, x); mOptionBundle.putFloat(EXTRA_ASPECT_RATIO_Y, y); + return this; } /** * Set an aspect ratio for crop bounds that is evaluated from source image width and height. * User won't see the menu with other ratios options. 
*/ - public void useSourceImageAspectRatio() { + public Options useSourceImageAspectRatio() { mOptionBundle.putFloat(EXTRA_ASPECT_RATIO_X, 0); mOptionBundle.putFloat(EXTRA_ASPECT_RATIO_Y, 0); + return this; } /** @@ -487,9 +515,10 @@ public void useSourceImageAspectRatio() { * @param width max cropped image width * @param height max cropped image height */ - public void withMaxResultSize(@IntRange(from = MIN_SIZE) int width, @IntRange(from = MIN_SIZE) int height) { + public Options withMaxResultSize(@IntRange(from = MIN_SIZE) int width, @IntRange(from = MIN_SIZE) int height) { mOptionBundle.putInt(EXTRA_MAX_SIZE_X, width); mOptionBundle.putInt(EXTRA_MAX_SIZE_Y, height); + return this; } } diff --git a/image/src/main/java/com/yalantis/ucrop/UCropActivity.java b/image/src/main/java/com/yalantis/ucrop/UCropActivity.java index 7661e9926..b7cc37c3a 100644 --- a/image/src/main/java/com/yalantis/ucrop/UCropActivity.java +++ b/image/src/main/java/com/yalantis/ucrop/UCropActivity.java @@ -7,6 +7,7 @@ import android.graphics.drawable.Animatable; import android.graphics.drawable.Drawable; import android.net.Uri; +import android.os.Build; import android.os.Bundle; import android.text.TextUtils; import android.util.Log; @@ -175,7 +176,7 @@ public void onCreateMenu(@NonNull Menu menu, @NonNull MenuInflater menuInflater) Drawable menuItemCropIcon = ContextCompat.getDrawable(this, mToolbarCropDrawable); if (menuItemCropIcon != null) { menuItemCropIcon.mutate(); - menuItemCropIcon.setColorFilter(mToolbarWidgetColor, PorterDuff.Mode.SRC_ATOP); + menuItemCropIcon.setColorFilter(new PorterDuffColorFilter(mToolbarWidgetColor, PorterDuff.Mode.SRC_ATOP)); menuItemCrop.setIcon(menuItemCropIcon); } } @@ -209,9 +210,17 @@ protected void onStop() { /** * This method extracts all data from the incoming intent and setups views properly. 
*/ + @SuppressWarnings("deprecation") private void setImageData(@NonNull Intent intent) { - Uri inputUri = intent.getParcelableExtra(UCrop.EXTRA_INPUT_URI); - Uri outputUri = intent.getParcelableExtra(UCrop.EXTRA_OUTPUT_URI); + Uri inputUri; + Uri outputUri; + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.TIRAMISU) { + inputUri = intent.getParcelableExtra(UCrop.EXTRA_INPUT_URI, Uri.class); + outputUri = intent.getParcelableExtra(UCrop.EXTRA_OUTPUT_URI, Uri.class); + } else { + inputUri = intent.getParcelableExtra(UCrop.EXTRA_INPUT_URI); + outputUri = intent.getParcelableExtra(UCrop.EXTRA_OUTPUT_URI); + } processOptions(intent); if (inputUri != null && outputUri != null) { @@ -362,7 +371,7 @@ private void setupAppBar() { // Color buttons inside the Toolbar Drawable stateButtonDrawable = ContextCompat.getDrawable(this, mToolbarCancelDrawable).mutate(); - stateButtonDrawable.setColorFilter(mToolbarWidgetColor, PorterDuff.Mode.SRC_ATOP); + stateButtonDrawable.setColorFilter(new PorterDuffColorFilter(mToolbarWidgetColor, PorterDuff.Mode.SRC_ATOP)); toolbar.setNavigationIcon(stateButtonDrawable); setSupportActionBar(toolbar); @@ -416,10 +425,15 @@ private void setStatusBarColor(@ColorInt int color) { } } + @SuppressWarnings("deprecation") private void setupAspectRatioWidget(@NonNull Intent intent) { - int aspectRationSelectedByDefault = intent.getIntExtra(UCrop.Options.EXTRA_ASPECT_RATIO_SELECTED_BY_DEFAULT, 0); - ArrayList aspectRatioList = intent.getParcelableArrayListExtra(UCrop.Options.EXTRA_ASPECT_RATIO_OPTIONS); + ArrayList aspectRatioList; + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.TIRAMISU) { + aspectRatioList = intent.getParcelableArrayListExtra(UCrop.Options.EXTRA_ASPECT_RATIO_OPTIONS, AspectRatio.class); + } else { + aspectRatioList = intent.getParcelableArrayListExtra(UCrop.Options.EXTRA_ASPECT_RATIO_OPTIONS); + } if (aspectRatioList == null || aspectRatioList.isEmpty()) { aspectRationSelectedByDefault = 2; @@ -596,9 +610,9 @@ private void 
setWidgetState(@IdRes int stateViewId) { private void changeSelectedTab(int stateViewId) { TransitionManager.beginDelayedTransition(findViewById(R.id.ucrop_photobox), mControlsTransition); - mWrapperStateScale.findViewById(R.id.text_view_scale).setVisibility(stateViewId == R.id.state_scale ? View.VISIBLE : View.GONE); + mWrapperStateScale.findViewById(R.id.text_view_scale_info).setVisibility(stateViewId == R.id.state_scale ? View.VISIBLE : View.GONE); mWrapperStateAspectRatio.findViewById(R.id.text_view_crop).setVisibility(stateViewId == R.id.state_aspect_ratio ? View.VISIBLE : View.GONE); - mWrapperStateRotate.findViewById(R.id.text_view_rotate).setVisibility(stateViewId == R.id.state_rotate ? View.VISIBLE : View.GONE); + mWrapperStateRotate.findViewById(R.id.text_view_rotate_info).setVisibility(stateViewId == R.id.state_rotate ? View.VISIBLE : View.GONE); } diff --git a/image/src/main/java/com/yalantis/ucrop/UCropFragment.java b/image/src/main/java/com/yalantis/ucrop/UCropFragment.java deleted file mode 100644 index a6345d1ce..000000000 --- a/image/src/main/java/com/yalantis/ucrop/UCropFragment.java +++ /dev/null @@ -1,569 +0,0 @@ -package com.yalantis.ucrop; - -import static androidx.appcompat.app.AppCompatActivity.RESULT_OK; - -import android.content.Context; -import android.content.Intent; -import android.graphics.Bitmap; -import android.graphics.PorterDuff; -import android.net.Uri; -import android.os.Bundle; -import android.text.TextUtils; -import android.view.LayoutInflater; -import android.view.View; -import android.view.ViewGroup; -import android.view.animation.AccelerateInterpolator; -import android.widget.FrameLayout; -import android.widget.ImageView; -import android.widget.LinearLayout; -import android.widget.RelativeLayout; -import android.widget.TextView; - -import androidx.annotation.ColorInt; -import androidx.annotation.IdRes; -import androidx.annotation.IntDef; -import androidx.annotation.NonNull; -import androidx.annotation.Nullable; -import 
androidx.appcompat.app.AppCompatDelegate; -import androidx.core.content.ContextCompat; -import androidx.fragment.app.Fragment; -import androidx.transition.AutoTransition; -import androidx.transition.Transition; -import androidx.transition.TransitionManager; - -import com.yalantis.ucrop.callback.BitmapCropCallback; -import com.yalantis.ucrop.model.AspectRatio; -import com.yalantis.ucrop.util.SelectedStateListDrawable; -import com.yalantis.ucrop.view.CropImageView; -import com.yalantis.ucrop.view.GestureCropImageView; -import com.yalantis.ucrop.view.OverlayView; -import com.yalantis.ucrop.view.TransformImageView; -import com.yalantis.ucrop.view.UCropView; -import com.yalantis.ucrop.view.widget.AspectRatioTextView; -import com.yalantis.ucrop.view.widget.HorizontalProgressWheelView; - -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; - -import me.minetsh.imaging.R; - -@SuppressWarnings("ConstantConditions") -public class UCropFragment extends Fragment { - - public static final int DEFAULT_COMPRESS_QUALITY = 90; - public static final Bitmap.CompressFormat DEFAULT_COMPRESS_FORMAT = Bitmap.CompressFormat.JPEG; - - public static final int NONE = 0; - public static final int SCALE = 1; - public static final int ROTATE = 2; - public static final int ALL = 3; - public static final String TAG = "UCropFragment"; - private static final long CONTROLS_ANIMATION_DURATION = 50; - private static final int TABS_COUNT = 3; - private static final int SCALE_WIDGET_SENSITIVITY_COEFFICIENT = 15000; - private static final int ROTATE_WIDGET_SENSITIVITY_COEFFICIENT = 42; - - static { - AppCompatDelegate.setCompatVectorFromResourcesEnabled(true); - } - - private final List mCropAspectRatioViews = new ArrayList<>(); - UCropFragmentCallback callback; - UCropView mUCropView; - GestureCropImageView mGestureCropImageView; - View mBlockingView; - private int mActiveControlsWidgetColor; 
- @ColorInt - private int mRootViewBackgroundColor; - private int mLogoColor; - private boolean mShowBottomControls; - private Transition mControlsTransition; - private OverlayView mOverlayView; - private ViewGroup mWrapperStateAspectRatio, mWrapperStateRotate, mWrapperStateScale; - private ViewGroup mLayoutAspectRatio, mLayoutRotate, mLayoutScale; - private TextView mTextViewRotateAngle, mTextViewScalePercent; - private final TransformImageView.TransformImageListener mImageListener = new TransformImageView.TransformImageListener() { - @Override - public void onRotate(float currentAngle) { - setAngleText(currentAngle); - } - - @Override - public void onScale(float currentScale) { - setScaleText(currentScale); - } - - @Override - public void onLoadComplete() { - mUCropView.animate().alpha(1).setDuration(300).setInterpolator(new AccelerateInterpolator()); - mBlockingView.setClickable(false); - callback.loadingProgress(false); - } - - @Override - public void onLoadFailure(@NonNull Exception e) { - callback.onCropFinish(getError(e)); - } - - }; - private Bitmap.CompressFormat mCompressFormat = DEFAULT_COMPRESS_FORMAT; - private int mCompressQuality = DEFAULT_COMPRESS_QUALITY; - private int[] mAllowedGestures = {SCALE, ROTATE, ALL}; - private final View.OnClickListener mStateClickListener = v -> { - if (!v.isSelected()) { - setWidgetState(v.getId()); - } - }; - - public static UCropFragment newInstance(Bundle uCrop) { - UCropFragment fragment = new UCropFragment(); - fragment.setArguments(uCrop); - return fragment; - } - - @Override - public void onAttach(@NonNull Context context) { - super.onAttach(context); - if (getParentFragment() instanceof UCropFragmentCallback) - callback = (UCropFragmentCallback) getParentFragment(); - else if (context instanceof UCropFragmentCallback) - callback = (UCropFragmentCallback) context; - else - throw new IllegalArgumentException(context - + " must implement UCropFragmentCallback"); - } - - public void 
setCallback(UCropFragmentCallback callback) { - this.callback = callback; - } - - @Nullable - @Override - public View onCreateView(@NonNull LayoutInflater inflater, @Nullable ViewGroup container, @Nullable Bundle savedInstanceState) { - View rootView = inflater.inflate(R.layout.ucrop_fragment_photobox, container, false); - - Bundle args = getArguments(); - - setupViews(rootView, args); - setImageData(args); - setInitialState(); - addBlockingView(rootView); - - return rootView; - } - - public void setupViews(View view, Bundle args) { - mActiveControlsWidgetColor = args.getInt(UCrop.Options.EXTRA_UCROP_COLOR_CONTROLS_WIDGET_ACTIVE, ContextCompat.getColor(getContext(), R.color.ucrop_color_widget_active)); - mLogoColor = args.getInt(UCrop.Options.EXTRA_UCROP_LOGO_COLOR, ContextCompat.getColor(getContext(), R.color.ucrop_color_default_logo)); - mShowBottomControls = !args.getBoolean(UCrop.Options.EXTRA_HIDE_BOTTOM_CONTROLS, false); - mRootViewBackgroundColor = args.getInt(UCrop.Options.EXTRA_UCROP_ROOT_VIEW_BACKGROUND_COLOR, ContextCompat.getColor(getContext(), R.color.ucrop_color_crop_background)); - - initiateRootViews(view); - callback.loadingProgress(true); - - if (mShowBottomControls) { - - ViewGroup wrapper = view.findViewById(R.id.controls_wrapper); - wrapper.setVisibility(View.VISIBLE); - LayoutInflater.from(getContext()).inflate(R.layout.ucrop_controls, wrapper, true); - - mControlsTransition = new AutoTransition(); - mControlsTransition.setDuration(CONTROLS_ANIMATION_DURATION); - - mWrapperStateAspectRatio = view.findViewById(R.id.state_aspect_ratio); - mWrapperStateAspectRatio.setOnClickListener(mStateClickListener); - mWrapperStateRotate = view.findViewById(R.id.state_rotate); - mWrapperStateRotate.setOnClickListener(mStateClickListener); - mWrapperStateScale = view.findViewById(R.id.state_scale); - mWrapperStateScale.setOnClickListener(mStateClickListener); - - mLayoutAspectRatio = view.findViewById(R.id.layout_aspect_ratio); - mLayoutRotate = 
view.findViewById(R.id.layout_rotate_wheel); - mLayoutScale = view.findViewById(R.id.layout_scale_wheel); - - setupAspectRatioWidget(args, view); - setupRotateWidget(view); - setupScaleWidget(view); - setupStatesWrapper(view); - } else { - RelativeLayout.LayoutParams params = (RelativeLayout.LayoutParams) view.findViewById(R.id.ucrop_frame).getLayoutParams(); - params.bottomMargin = 0; - view.findViewById(R.id.ucrop_frame).requestLayout(); - } - } - - private void setImageData(@NonNull Bundle bundle) { - Uri inputUri = bundle.getParcelable(UCrop.EXTRA_INPUT_URI); - Uri outputUri = bundle.getParcelable(UCrop.EXTRA_OUTPUT_URI); - processOptions(bundle); - - if (inputUri != null && outputUri != null) { - try { - mGestureCropImageView.setImageUri(inputUri, outputUri); - } catch (Exception e) { - callback.onCropFinish(getError(e)); - } - } else { - callback.onCropFinish(getError(new NullPointerException(getString(R.string.ucrop_error_input_data_is_absent)))); - } - } - - /** - * This method extracts {@link com.yalantis.ucrop.UCrop.Options #optionsBundle} from incoming bundle - * and setups fragment, {@link OverlayView} and {@link CropImageView} properly. - */ - @SuppressWarnings("deprecation") - private void processOptions(@NonNull Bundle bundle) { - // Bitmap compression options - String compressionFormatName = bundle.getString(UCrop.Options.EXTRA_COMPRESSION_FORMAT_NAME); - Bitmap.CompressFormat compressFormat = null; - if (!TextUtils.isEmpty(compressionFormatName)) { - compressFormat = Bitmap.CompressFormat.valueOf(compressionFormatName); - } - mCompressFormat = (compressFormat == null) ? 
DEFAULT_COMPRESS_FORMAT : compressFormat; - - mCompressQuality = bundle.getInt(UCrop.Options.EXTRA_COMPRESSION_QUALITY, UCropActivity.DEFAULT_COMPRESS_QUALITY); - - // Gestures options - int[] allowedGestures = bundle.getIntArray(UCrop.Options.EXTRA_ALLOWED_GESTURES); - if (allowedGestures != null && allowedGestures.length == TABS_COUNT) { - mAllowedGestures = allowedGestures; - } - - // Crop image view options - mGestureCropImageView.setMaxBitmapSize(bundle.getInt(UCrop.Options.EXTRA_MAX_BITMAP_SIZE, CropImageView.DEFAULT_MAX_BITMAP_SIZE)); - mGestureCropImageView.setMaxScaleMultiplier(bundle.getFloat(UCrop.Options.EXTRA_MAX_SCALE_MULTIPLIER, CropImageView.DEFAULT_MAX_SCALE_MULTIPLIER)); - mGestureCropImageView.setImageToWrapCropBoundsAnimDuration(bundle.getInt(UCrop.Options.EXTRA_IMAGE_TO_CROP_BOUNDS_ANIM_DURATION, CropImageView.DEFAULT_IMAGE_TO_CROP_BOUNDS_ANIM_DURATION)); - - // Overlay view options - mOverlayView.setFreestyleCropEnabled(bundle.getBoolean(UCrop.Options.EXTRA_FREE_STYLE_CROP, OverlayView.DEFAULT_FREESTYLE_CROP_MODE != OverlayView.FREESTYLE_CROP_MODE_DISABLE)); - - mOverlayView.setDimmedColor(bundle.getInt(UCrop.Options.EXTRA_DIMMED_LAYER_COLOR, getResources().getColor(R.color.ucrop_color_default_dimmed))); - mOverlayView.setCircleDimmedLayer(bundle.getBoolean(UCrop.Options.EXTRA_CIRCLE_DIMMED_LAYER, OverlayView.DEFAULT_CIRCLE_DIMMED_LAYER)); - - mOverlayView.setShowCropFrame(bundle.getBoolean(UCrop.Options.EXTRA_SHOW_CROP_FRAME, OverlayView.DEFAULT_SHOW_CROP_FRAME)); - mOverlayView.setCropFrameColor(bundle.getInt(UCrop.Options.EXTRA_CROP_FRAME_COLOR, getResources().getColor(R.color.ucrop_color_default_crop_frame))); - mOverlayView.setCropFrameStrokeWidth(bundle.getInt(UCrop.Options.EXTRA_CROP_FRAME_STROKE_WIDTH, getResources().getDimensionPixelSize(R.dimen.ucrop_default_crop_frame_stoke_width))); - - mOverlayView.setShowCropGrid(bundle.getBoolean(UCrop.Options.EXTRA_SHOW_CROP_GRID, OverlayView.DEFAULT_SHOW_CROP_GRID)); - 
mOverlayView.setCropGridRowCount(bundle.getInt(UCrop.Options.EXTRA_CROP_GRID_ROW_COUNT, OverlayView.DEFAULT_CROP_GRID_ROW_COUNT)); - mOverlayView.setCropGridColumnCount(bundle.getInt(UCrop.Options.EXTRA_CROP_GRID_COLUMN_COUNT, OverlayView.DEFAULT_CROP_GRID_COLUMN_COUNT)); - mOverlayView.setCropGridColor(bundle.getInt(UCrop.Options.EXTRA_CROP_GRID_COLOR, getResources().getColor(R.color.ucrop_color_default_crop_grid))); - mOverlayView.setCropGridStrokeWidth(bundle.getInt(UCrop.Options.EXTRA_CROP_GRID_STROKE_WIDTH, getResources().getDimensionPixelSize(R.dimen.ucrop_default_crop_grid_stoke_width))); - - // Aspect ratio options - float aspectRatioX = bundle.getFloat(UCrop.EXTRA_ASPECT_RATIO_X, 0); - float aspectRatioY = bundle.getFloat(UCrop.EXTRA_ASPECT_RATIO_Y, 0); - - int aspectRationSelectedByDefault = bundle.getInt(UCrop.Options.EXTRA_ASPECT_RATIO_SELECTED_BY_DEFAULT, 0); - ArrayList aspectRatioList = bundle.getParcelableArrayList(UCrop.Options.EXTRA_ASPECT_RATIO_OPTIONS); - - if (aspectRatioX > 0 && aspectRatioY > 0) { - if (mWrapperStateAspectRatio != null) { - mWrapperStateAspectRatio.setVisibility(View.GONE); - } - mGestureCropImageView.setTargetAspectRatio(aspectRatioX / aspectRatioY); - } else if (aspectRatioList != null && aspectRationSelectedByDefault < aspectRatioList.size()) { - mGestureCropImageView.setTargetAspectRatio(aspectRatioList.get(aspectRationSelectedByDefault).getAspectRatioX() / - aspectRatioList.get(aspectRationSelectedByDefault).getAspectRatioY()); - } else { - mGestureCropImageView.setTargetAspectRatio(CropImageView.SOURCE_IMAGE_ASPECT_RATIO); - } - - // Result bitmap max size options - int maxSizeX = bundle.getInt(UCrop.EXTRA_MAX_SIZE_X, 0); - int maxSizeY = bundle.getInt(UCrop.EXTRA_MAX_SIZE_Y, 0); - - if (maxSizeX > 0 && maxSizeY > 0) { - mGestureCropImageView.setMaxResultImageSizeX(maxSizeX); - mGestureCropImageView.setMaxResultImageSizeY(maxSizeY); - } - } - - private void initiateRootViews(View view) { - mUCropView = 
view.findViewById(R.id.ucrop); - mGestureCropImageView = mUCropView.getCropImageView(); - mOverlayView = mUCropView.getOverlayView(); - - mGestureCropImageView.setTransformImageListener(mImageListener); - - ((ImageView) view.findViewById(R.id.image_view_logo)).setColorFilter(mLogoColor, PorterDuff.Mode.SRC_ATOP); - - view.findViewById(R.id.ucrop_frame).setBackgroundColor(mRootViewBackgroundColor); - } - - /** - * Use {@link #mActiveControlsWidgetColor} for color filter - */ - private void setupStatesWrapper(View view) { - ImageView stateScaleImageView = view.findViewById(R.id.image_view_state_scale); - ImageView stateRotateImageView = view.findViewById(R.id.image_view_state_rotate); - ImageView stateAspectRatioImageView = view.findViewById(R.id.image_view_state_aspect_ratio); - - stateScaleImageView.setImageDrawable(new SelectedStateListDrawable(stateScaleImageView.getDrawable(), mActiveControlsWidgetColor)); - stateRotateImageView.setImageDrawable(new SelectedStateListDrawable(stateRotateImageView.getDrawable(), mActiveControlsWidgetColor)); - stateAspectRatioImageView.setImageDrawable(new SelectedStateListDrawable(stateAspectRatioImageView.getDrawable(), mActiveControlsWidgetColor)); - } - - private void setupAspectRatioWidget(@NonNull Bundle bundle, View view) { - int aspectRationSelectedByDefault = bundle.getInt(UCrop.Options.EXTRA_ASPECT_RATIO_SELECTED_BY_DEFAULT, 0); - ArrayList aspectRatioList = bundle.getParcelableArrayList(UCrop.Options.EXTRA_ASPECT_RATIO_OPTIONS); - - if (aspectRatioList == null || aspectRatioList.isEmpty()) { - aspectRationSelectedByDefault = 2; - - aspectRatioList = new ArrayList<>(); - aspectRatioList.add(new AspectRatio(null, 1, 1)); - aspectRatioList.add(new AspectRatio(null, 3, 4)); - aspectRatioList.add(new AspectRatio(getString(R.string.ucrop_label_original).toUpperCase(), - CropImageView.SOURCE_IMAGE_ASPECT_RATIO, CropImageView.SOURCE_IMAGE_ASPECT_RATIO)); - aspectRatioList.add(new AspectRatio(null, 3, 2)); - 
aspectRatioList.add(new AspectRatio(null, 16, 9)); - } - - LinearLayout wrapperAspectRatioList = view.findViewById(R.id.layout_aspect_ratio); - - FrameLayout wrapperAspectRatio; - AspectRatioTextView aspectRatioTextView; - LinearLayout.LayoutParams lp = new LinearLayout.LayoutParams(0, ViewGroup.LayoutParams.MATCH_PARENT); - lp.weight = 1; - for (AspectRatio aspectRatio : aspectRatioList) { - wrapperAspectRatio = (FrameLayout) getLayoutInflater().inflate(R.layout.ucrop_aspect_ratio, null); - wrapperAspectRatio.setLayoutParams(lp); - aspectRatioTextView = ((AspectRatioTextView) wrapperAspectRatio.getChildAt(0)); - aspectRatioTextView.setActiveColor(mActiveControlsWidgetColor); - aspectRatioTextView.setAspectRatio(aspectRatio); - - wrapperAspectRatioList.addView(wrapperAspectRatio); - mCropAspectRatioViews.add(wrapperAspectRatio); - } - - mCropAspectRatioViews.get(aspectRationSelectedByDefault).setSelected(true); - - for (ViewGroup cropAspectRatioView : mCropAspectRatioViews) { - cropAspectRatioView.setOnClickListener(v -> { - mGestureCropImageView.setTargetAspectRatio( - ((AspectRatioTextView) ((ViewGroup) v).getChildAt(0)).getAspectRatio(v.isSelected())); - mGestureCropImageView.setImageToWrapCropBounds(); - if (!v.isSelected()) { - for (ViewGroup cropAspectRatioView1 : mCropAspectRatioViews) { - cropAspectRatioView1.setSelected(cropAspectRatioView1 == v); - } - } - }); - } - } - - private void setupRotateWidget(View view) { - mTextViewRotateAngle = view.findViewById(R.id.text_view_rotate); - ((HorizontalProgressWheelView) view.findViewById(R.id.rotate_scroll_wheel)) - .setScrollingListener(new HorizontalProgressWheelView.ScrollingListener() { - @Override - public void onScroll(float delta, float totalDistance) { - mGestureCropImageView.postRotate(delta / ROTATE_WIDGET_SENSITIVITY_COEFFICIENT); - } - - @Override - public void onScrollEnd() { - mGestureCropImageView.setImageToWrapCropBounds(); - } - - @Override - public void onScrollStart() { - 
mGestureCropImageView.cancelAllAnimations(); - } - }); - - ((HorizontalProgressWheelView) view.findViewById(R.id.rotate_scroll_wheel)).setMiddleLineColor(mActiveControlsWidgetColor); - - - view.findViewById(R.id.wrapper_reset_rotate).setOnClickListener(v -> resetRotation()); - view.findViewById(R.id.wrapper_rotate_by_angle).setOnClickListener(v -> rotateByAngle(90)); - - setAngleTextColor(mActiveControlsWidgetColor); - } - - private void setupScaleWidget(View view) { - mTextViewScalePercent = view.findViewById(R.id.text_view_scale); - ((HorizontalProgressWheelView) view.findViewById(R.id.scale_scroll_wheel)) - .setScrollingListener(new HorizontalProgressWheelView.ScrollingListener() { - @Override - public void onScroll(float delta, float totalDistance) { - if (delta > 0) { - mGestureCropImageView.zoomInImage(mGestureCropImageView.getCurrentScale() - + delta * ((mGestureCropImageView.getMaxScale() - mGestureCropImageView.getMinScale()) / SCALE_WIDGET_SENSITIVITY_COEFFICIENT)); - } else { - mGestureCropImageView.zoomOutImage(mGestureCropImageView.getCurrentScale() - + delta * ((mGestureCropImageView.getMaxScale() - mGestureCropImageView.getMinScale()) / SCALE_WIDGET_SENSITIVITY_COEFFICIENT)); - } - } - - @Override - public void onScrollEnd() { - mGestureCropImageView.setImageToWrapCropBounds(); - } - - @Override - public void onScrollStart() { - mGestureCropImageView.cancelAllAnimations(); - } - }); - ((HorizontalProgressWheelView) view.findViewById(R.id.scale_scroll_wheel)).setMiddleLineColor(mActiveControlsWidgetColor); - - setScaleTextColor(mActiveControlsWidgetColor); - } - - void setAngleText(float angle) { - if (mTextViewRotateAngle != null) { - mTextViewRotateAngle.setText(String.format(Locale.getDefault(), "%.1f°", angle)); - } - } - - private void setAngleTextColor(int textColor) { - if (mTextViewRotateAngle != null) { - mTextViewRotateAngle.setTextColor(textColor); - } - } - - void setScaleText(float scale) { - if (mTextViewScalePercent != null) { - 
mTextViewScalePercent.setText(String.format(Locale.getDefault(), "%d%%", (int) (scale * 100))); - } - } - - private void setScaleTextColor(int textColor) { - if (mTextViewScalePercent != null) { - mTextViewScalePercent.setTextColor(textColor); - } - } - - private void resetRotation() { - mGestureCropImageView.postRotate(-mGestureCropImageView.getCurrentAngle()); - mGestureCropImageView.setImageToWrapCropBounds(); - } - - private void rotateByAngle(int angle) { - mGestureCropImageView.postRotate(angle); - mGestureCropImageView.setImageToWrapCropBounds(); - } - - private void setInitialState() { - if (mShowBottomControls) { - if (mWrapperStateAspectRatio.getVisibility() == View.VISIBLE) { - setWidgetState(R.id.state_aspect_ratio); - } else { - setWidgetState(R.id.state_scale); - } - } else { - setAllowedGestures(0); - } - } - - private void setWidgetState(@IdRes int stateViewId) { - if (!mShowBottomControls) return; - - mWrapperStateAspectRatio.setSelected(stateViewId == R.id.state_aspect_ratio); - mWrapperStateRotate.setSelected(stateViewId == R.id.state_rotate); - mWrapperStateScale.setSelected(stateViewId == R.id.state_scale); - - mLayoutAspectRatio.setVisibility(stateViewId == R.id.state_aspect_ratio ? View.VISIBLE : View.GONE); - mLayoutRotate.setVisibility(stateViewId == R.id.state_rotate ? View.VISIBLE : View.GONE); - mLayoutScale.setVisibility(stateViewId == R.id.state_scale ? View.VISIBLE : View.GONE); - - changeSelectedTab(stateViewId); - - if (stateViewId == R.id.state_scale) { - setAllowedGestures(0); - } else if (stateViewId == R.id.state_rotate) { - setAllowedGestures(1); - } else { - setAllowedGestures(2); - } - } - - private void changeSelectedTab(int stateViewId) { - if (getView() != null) { - TransitionManager.beginDelayedTransition(getView().findViewById(R.id.ucrop_photobox), mControlsTransition); - } - mWrapperStateScale.findViewById(R.id.text_view_scale).setVisibility(stateViewId == R.id.state_scale ? 
View.VISIBLE : View.GONE); - mWrapperStateAspectRatio.findViewById(R.id.text_view_crop).setVisibility(stateViewId == R.id.state_aspect_ratio ? View.VISIBLE : View.GONE); - mWrapperStateRotate.findViewById(R.id.text_view_rotate).setVisibility(stateViewId == R.id.state_rotate ? View.VISIBLE : View.GONE); - } - - private void setAllowedGestures(int tab) { - mGestureCropImageView.setScaleEnabled(mAllowedGestures[tab] == ALL || mAllowedGestures[tab] == SCALE); - mGestureCropImageView.setRotateEnabled(mAllowedGestures[tab] == ALL || mAllowedGestures[tab] == ROTATE); - } - - /** - * Adds view that covers everything below the Toolbar. - * When it's clickable - user won't be able to click/touch anything below the Toolbar. - * Need to block user input while loading and cropping an image. - */ - private void addBlockingView(View view) { - if (mBlockingView == null) { - mBlockingView = new View(getContext()); - RelativeLayout.LayoutParams lp = new RelativeLayout.LayoutParams(ViewGroup.LayoutParams.MATCH_PARENT, ViewGroup.LayoutParams.MATCH_PARENT); - mBlockingView.setLayoutParams(lp); - mBlockingView.setClickable(true); - } - - ((RelativeLayout) view.findViewById(R.id.ucrop_photobox)).addView(mBlockingView); - } - - public void cropAndSaveImage() { - mBlockingView.setClickable(true); - callback.loadingProgress(true); - - mGestureCropImageView.cropAndSaveImage(mCompressFormat, mCompressQuality, new BitmapCropCallback() { - - @Override - public void onBitmapCropped(@NonNull Uri resultUri, int offsetX, int offsetY, int imageWidth, int imageHeight) { - callback.onCropFinish(getResult(resultUri, mGestureCropImageView.getTargetAspectRatio(), offsetX, offsetY, imageWidth, imageHeight)); - callback.loadingProgress(false); - } - - @Override - public void onCropFailure(@NonNull Throwable t) { - callback.onCropFinish(getError(t)); - } - }); - } - - protected UCropResult getResult(Uri uri, float resultAspectRatio, int offsetX, int offsetY, int imageWidth, int imageHeight) { - return new 
UCropResult(RESULT_OK, new Intent() - .putExtra(UCrop.EXTRA_OUTPUT_URI, uri) - .putExtra(UCrop.EXTRA_OUTPUT_CROP_ASPECT_RATIO, resultAspectRatio) - .putExtra(UCrop.EXTRA_OUTPUT_IMAGE_WIDTH, imageWidth) - .putExtra(UCrop.EXTRA_OUTPUT_IMAGE_HEIGHT, imageHeight) - .putExtra(UCrop.EXTRA_OUTPUT_OFFSET_X, offsetX) - .putExtra(UCrop.EXTRA_OUTPUT_OFFSET_Y, offsetY) - ); - } - - protected UCropResult getError(Throwable throwable) { - return new UCropResult(UCrop.RESULT_ERROR, new Intent().putExtra(UCrop.EXTRA_ERROR, throwable)); - } - - @IntDef({NONE, SCALE, ROTATE, ALL}) - @Retention(RetentionPolicy.SOURCE) - public @interface GestureTypes { - } - - public static class UCropResult { - - public final int mResultCode; - public final Intent mResultData; - - public UCropResult(int resultCode, Intent data) { - mResultCode = resultCode; - mResultData = data; - } - - } - -} - diff --git a/image/src/main/java/com/yalantis/ucrop/UCropFragmentCallback.java b/image/src/main/java/com/yalantis/ucrop/UCropFragmentCallback.java deleted file mode 100644 index 31ce42554..000000000 --- a/image/src/main/java/com/yalantis/ucrop/UCropFragmentCallback.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.yalantis.ucrop; - -public interface UCropFragmentCallback { - - /** - * Return loader status - */ - void loadingProgress(boolean showLoader); - - /** - * Return cropping result or error - */ - void onCropFinish(UCropFragment.UCropResult result); - -} diff --git a/image/src/main/java/com/yalantis/ucrop/task/BitmapCropTask.java b/image/src/main/java/com/yalantis/ucrop/task/BitmapCropTask.java index 9132171e6..1b7aaabd9 100644 --- a/image/src/main/java/com/yalantis/ucrop/task/BitmapCropTask.java +++ b/image/src/main/java/com/yalantis/ucrop/task/BitmapCropTask.java @@ -1,5 +1,6 @@ package com.yalantis.ucrop.task; +import android.annotation.SuppressLint; import android.content.Context; import android.graphics.Bitmap; import android.graphics.Matrix; @@ -83,6 +84,7 @@ private static CompletableTransformer 
applyCompletableIOToMainSchedulers() { .observeOn(AndroidSchedulers.mainThread()); } + @SuppressLint("CheckResult") public void execute() { doInBackground().compose(applyCompletableIOToMainSchedulers()) .subscribe(() -> { diff --git a/image/src/main/java/com/yalantis/ucrop/task/BitmapLoadTask.java b/image/src/main/java/com/yalantis/ucrop/task/BitmapLoadTask.java index 196d96c34..9e74a6e13 100644 --- a/image/src/main/java/com/yalantis/ucrop/task/BitmapLoadTask.java +++ b/image/src/main/java/com/yalantis/ucrop/task/BitmapLoadTask.java @@ -62,6 +62,7 @@ private static SingleTransformer applySingleIOToMainSchedulers() { .observeOn(AndroidSchedulers.mainThread()); } + @SuppressLint("CheckResult") public void execute() { doInBackground().compose(applySingleIOToMainSchedulers()).subscribe(this::onPostExecute, e -> onPostExecute(new BitmapWorkerResult((Exception) e))); } diff --git a/image/src/main/java/com/yalantis/ucrop/util/BitmapLoadUtils.java b/image/src/main/java/com/yalantis/ucrop/util/BitmapLoadUtils.java index 7b8e5a8ec..720607389 100644 --- a/image/src/main/java/com/yalantis/ucrop/util/BitmapLoadUtils.java +++ b/image/src/main/java/com/yalantis/ucrop/util/BitmapLoadUtils.java @@ -1,15 +1,14 @@ package com.yalantis.ucrop.util; import android.content.Context; +import android.content.res.Resources; import android.graphics.Bitmap; import android.graphics.BitmapFactory; import android.graphics.Canvas; import android.graphics.Matrix; -import android.graphics.Point; import android.net.Uri; +import android.util.DisplayMetrics; import android.util.Log; -import android.view.Display; -import android.view.WindowManager; import androidx.annotation.NonNull; import androidx.annotation.Nullable; @@ -124,19 +123,13 @@ public static int exifToTranslation(int exifOrientation) { * * @return - max bitmap size in pixels. 
*/ - public static int calculateMaxBitmapSize(@NonNull Context context) { - WindowManager wm = (WindowManager) context.getSystemService(Context.WINDOW_SERVICE); - Display display; + public static int calculateMaxBitmapSize() { int width, height; - Point size = new Point(); - if (wm != null) { - display = wm.getDefaultDisplay(); - display.getSize(size); - } + DisplayMetrics dm = Resources.getSystem().getDisplayMetrics(); - width = size.x; - height = size.y; + width = dm.widthPixels; + height = dm.heightPixels; // Twice the device screen diagonal as default int maxBitmapSize = (int) Math.sqrt(Math.pow(width, 2) + Math.pow(height, 2)); diff --git a/image/src/main/java/com/yalantis/ucrop/util/FastBitmapDrawable.java b/image/src/main/java/com/yalantis/ucrop/util/FastBitmapDrawable.java index 53c621bb5..74913c118 100644 --- a/image/src/main/java/com/yalantis/ucrop/util/FastBitmapDrawable.java +++ b/image/src/main/java/com/yalantis/ucrop/util/FastBitmapDrawable.java @@ -22,6 +22,8 @@ import android.graphics.PixelFormat; import android.graphics.drawable.Drawable; +import androidx.annotation.NonNull; + public class FastBitmapDrawable extends Drawable { private final Paint mPaint = new Paint(Paint.FILTER_BITMAP_FLAG); @@ -36,7 +38,7 @@ public FastBitmapDrawable(Bitmap b) { } @Override - public void draw(Canvas canvas) { + public void draw(@NonNull Canvas canvas) { if (mBitmap != null && !mBitmap.isRecycled()) { canvas.drawBitmap(mBitmap, null, getBounds(), mPaint); } diff --git a/image/src/main/java/com/yalantis/ucrop/util/SelectedStateListDrawable.java b/image/src/main/java/com/yalantis/ucrop/util/SelectedStateListDrawable.java index 9943047cc..b0d02e483 100644 --- a/image/src/main/java/com/yalantis/ucrop/util/SelectedStateListDrawable.java +++ b/image/src/main/java/com/yalantis/ucrop/util/SelectedStateListDrawable.java @@ -1,6 +1,7 @@ package com.yalantis.ucrop.util; import android.graphics.PorterDuff; +import android.graphics.PorterDuffColorFilter; import 
android.graphics.drawable.Drawable; import android.graphics.drawable.StateListDrawable; @@ -27,7 +28,7 @@ protected boolean onStateChange(int[] states) { } } if (isStatePressedInArray) { - setColorFilter(mSelectionColor, PorterDuff.Mode.SRC_ATOP); + setColorFilter(new PorterDuffColorFilter(mSelectionColor, PorterDuff.Mode.SRC_ATOP)); } else { clearColorFilter(); } diff --git a/image/src/main/java/com/yalantis/ucrop/view/GestureCropImageView.java b/image/src/main/java/com/yalantis/ucrop/view/GestureCropImageView.java index a597995f4..0709c9f07 100644 --- a/image/src/main/java/com/yalantis/ucrop/view/GestureCropImageView.java +++ b/image/src/main/java/com/yalantis/ucrop/view/GestureCropImageView.java @@ -7,6 +7,8 @@ import android.view.MotionEvent; import android.view.ScaleGestureDetector; +import androidx.annotation.NonNull; + import com.yalantis.ucrop.util.RotationGestureDetector; /** @@ -131,7 +133,7 @@ public boolean onDoubleTap(MotionEvent e) { } @Override - public boolean onScroll(MotionEvent e1, MotionEvent e2, float distanceX, float distanceY) { + public boolean onScroll(@NonNull MotionEvent e1, @NonNull MotionEvent e2, float distanceX, float distanceY) { postTranslate(-distanceX, -distanceY); return true; } diff --git a/image/src/main/java/com/yalantis/ucrop/view/OverlayView.java b/image/src/main/java/com/yalantis/ucrop/view/OverlayView.java index 4ee7747b4..03f9b1c62 100644 --- a/image/src/main/java/com/yalantis/ucrop/view/OverlayView.java +++ b/image/src/main/java/com/yalantis/ucrop/view/OverlayView.java @@ -8,6 +8,7 @@ import android.graphics.Path; import android.graphics.RectF; import android.graphics.Region; +import android.os.Build; import android.util.AttributeSet; import android.view.MotionEvent; import android.view.View; @@ -16,6 +17,7 @@ import androidx.annotation.IntDef; import androidx.annotation.IntRange; import androidx.annotation.NonNull; +import androidx.core.content.ContextCompat; import com.yalantis.ucrop.callback.OverlayViewChangeListener; 
import com.yalantis.ucrop.util.RectUtils; @@ -436,12 +438,21 @@ private int getCurrentTouchIndex(float touchX, float touchY) { * * @param canvas - valid canvas object */ + @SuppressWarnings("deprecation") protected void drawDimmedLayer(@NonNull Canvas canvas) { canvas.save(); if (mCircleDimmedLayer) { - canvas.clipPath(mCircularPath, Region.Op.DIFFERENCE); + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + canvas.clipOutPath(mCircularPath); + } else { + canvas.clipPath(mCircularPath, Region.Op.DIFFERENCE); + } } else { - canvas.clipRect(mCropViewRect, Region.Op.DIFFERENCE); + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + canvas.clipOutRect(mCropViewRect); + } else { + canvas.clipRect(mCropViewRect, Region.Op.DIFFERENCE); + } } canvas.drawColor(mDimmedColor); canvas.restore(); @@ -458,6 +469,7 @@ protected void drawDimmedLayer(@NonNull Canvas canvas) { * * @param canvas - valid canvas object */ + @SuppressWarnings("deprecation") protected void drawCropGrid(@NonNull Canvas canvas) { if (mShowCropGrid) { if (mGridPoints == null && !mCropViewRect.isEmpty()) { @@ -494,11 +506,19 @@ protected void drawCropGrid(@NonNull Canvas canvas) { mTempRect.set(mCropViewRect); mTempRect.inset(mCropRectCornerTouchAreaLineLength, -mCropRectCornerTouchAreaLineLength); - canvas.clipRect(mTempRect, Region.Op.DIFFERENCE); + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + canvas.clipOutRect(mTempRect); + } else { + canvas.clipRect(mTempRect, Region.Op.DIFFERENCE); + } mTempRect.set(mCropViewRect); mTempRect.inset(-mCropRectCornerTouchAreaLineLength, mCropRectCornerTouchAreaLineLength); - canvas.clipRect(mTempRect, Region.Op.DIFFERENCE); + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + canvas.clipOutRect(mTempRect); + } else { + canvas.clipRect(mTempRect, Region.Op.DIFFERENCE); + } canvas.drawRect(mCropViewRect, mCropFrameCornersPaint); @@ -513,7 +533,7 @@ protected void drawCropGrid(@NonNull Canvas canvas) { protected void processStyledAttributes(@NonNull 
TypedArray a) { mCircleDimmedLayer = a.getBoolean(R.styleable.ucrop_UCropView_ucrop_circle_dimmed_layer, DEFAULT_CIRCLE_DIMMED_LAYER); mDimmedColor = a.getColor(R.styleable.ucrop_UCropView_ucrop_dimmed_color, - getResources().getColor(R.color.ucrop_color_default_dimmed)); + ContextCompat.getColor(getContext(), R.color.ucrop_color_default_dimmed)); mDimmedStrokePaint.setColor(mDimmedColor); mDimmedStrokePaint.setStyle(Paint.Style.STROKE); mDimmedStrokePaint.setStrokeWidth(1); @@ -532,7 +552,7 @@ private void initCropFrameStyle(@NonNull TypedArray a) { int cropFrameStrokeSize = a.getDimensionPixelSize(R.styleable.ucrop_UCropView_ucrop_frame_stroke_size, getResources().getDimensionPixelSize(R.dimen.ucrop_default_crop_frame_stoke_width)); int cropFrameColor = a.getColor(R.styleable.ucrop_UCropView_ucrop_frame_color, - getResources().getColor(R.color.ucrop_color_default_crop_frame)); + ContextCompat.getColor(getContext(), R.color.ucrop_color_default_crop_frame)); mCropFramePaint.setStrokeWidth(cropFrameStrokeSize); mCropFramePaint.setColor(cropFrameColor); mCropFramePaint.setStyle(Paint.Style.STROKE); @@ -549,7 +569,7 @@ private void initCropGridStyle(@NonNull TypedArray a) { int cropGridStrokeSize = a.getDimensionPixelSize(R.styleable.ucrop_UCropView_ucrop_grid_stroke_size, getResources().getDimensionPixelSize(R.dimen.ucrop_default_crop_grid_stoke_width)); int cropGridColor = a.getColor(R.styleable.ucrop_UCropView_ucrop_grid_color, - getResources().getColor(R.color.ucrop_color_default_crop_grid)); + ContextCompat.getColor(getContext(), R.color.ucrop_color_default_crop_grid)); mCropGridPaint.setStrokeWidth(cropGridStrokeSize); mCropGridPaint.setColor(cropGridColor); diff --git a/image/src/main/java/com/yalantis/ucrop/view/TransformImageView.java b/image/src/main/java/com/yalantis/ucrop/view/TransformImageView.java index ac0914a51..5edfca37d 100644 --- a/image/src/main/java/com/yalantis/ucrop/view/TransformImageView.java +++ 
b/image/src/main/java/com/yalantis/ucrop/view/TransformImageView.java @@ -77,7 +77,7 @@ public void setScaleType(ScaleType scaleType) { public int getMaxBitmapSize() { if (mMaxBitmapSize <= 0) { - mMaxBitmapSize = BitmapLoadUtils.calculateMaxBitmapSize(getContext()); + mMaxBitmapSize = BitmapLoadUtils.calculateMaxBitmapSize(); } return mMaxBitmapSize; } diff --git a/image/src/main/java/com/yalantis/ucrop/view/widget/AspectRatioTextView.java b/image/src/main/java/com/yalantis/ucrop/view/widget/AspectRatioTextView.java index 93c3fd9f6..c66559fd8 100644 --- a/image/src/main/java/com/yalantis/ucrop/view/widget/AspectRatioTextView.java +++ b/image/src/main/java/com/yalantis/ucrop/view/widget/AspectRatioTextView.java @@ -124,7 +124,7 @@ private void init(@NonNull TypedArray a) { setTitle(); - int activeColor = getResources().getColor(R.color.ucrop_color_widget_active); + int activeColor = ContextCompat.getColor(getContext(), R.color.ucrop_color_widget_active); applyActiveColor(activeColor); a.recycle(); diff --git a/image/src/main/java/com/yalantis/ucrop/view/widget/HorizontalProgressWheelView.java b/image/src/main/java/com/yalantis/ucrop/view/widget/HorizontalProgressWheelView.java index cb4004294..bbbbb6d96 100644 --- a/image/src/main/java/com/yalantis/ucrop/view/widget/HorizontalProgressWheelView.java +++ b/image/src/main/java/com/yalantis/ucrop/view/widget/HorizontalProgressWheelView.java @@ -139,7 +139,7 @@ private void init() { mProgressLinePaint = new Paint(Paint.ANTI_ALIAS_FLAG); mProgressLinePaint.setStyle(Paint.Style.STROKE); mProgressLinePaint.setStrokeWidth(mProgressLineWidth); - mProgressLinePaint.setColor(getResources().getColor(R.color.ucrop_color_progress_wheel_line)); + mProgressLinePaint.setColor(ContextCompat.getColor(getContext(), R.color.ucrop_color_progress_wheel_line)); mProgressMiddleLinePaint = new Paint(mProgressLinePaint); mProgressMiddleLinePaint.setColor(mMiddleLineColor); diff --git 
a/image/src/main/java/me/minetsh/imaging/IMGEditActivity.java b/image/src/main/java/me/minetsh/imaging/IMGEditActivity.java index 711191777..07ad97ce6 100644 --- a/image/src/main/java/me/minetsh/imaging/IMGEditActivity.java +++ b/image/src/main/java/me/minetsh/imaging/IMGEditActivity.java @@ -4,6 +4,7 @@ import android.graphics.Bitmap; import android.graphics.BitmapFactory; import android.net.Uri; +import android.os.Build; import android.text.TextUtils; import java.io.FileNotFoundException; @@ -33,6 +34,7 @@ public void onCreated() { } + @SuppressWarnings("deprecation") @Override public Bitmap getBitmap() { Intent intent = getIntent(); @@ -40,7 +42,12 @@ public Bitmap getBitmap() { return null; } - Uri uri = intent.getParcelableExtra(EXTRA_IMAGE_URI); + Uri uri; + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.TIRAMISU) { + uri = intent.getParcelableExtra(EXTRA_IMAGE_URI, Uri.class); + } else { + uri = intent.getParcelableExtra(EXTRA_IMAGE_URI); + } if (uri == null) { return null; } @@ -48,7 +55,7 @@ public Bitmap getBitmap() { IMGDecoder decoder = null; String path = uri.getPath(); - if (!TextUtils.isEmpty(path)) { + if (!TextUtils.isEmpty(path) && uri.getScheme() != null) { switch (uri.getScheme()) { case "asset": decoder = new IMGAssetFileDecoder(this, uri); diff --git a/image/src/main/java/me/minetsh/imaging/IMGEditBaseActivity.java b/image/src/main/java/me/minetsh/imaging/IMGEditBaseActivity.java index 9faf76a1e..b5a53fa2f 100644 --- a/image/src/main/java/me/minetsh/imaging/IMGEditBaseActivity.java +++ b/image/src/main/java/me/minetsh/imaging/IMGEditBaseActivity.java @@ -18,8 +18,7 @@ * Created by felix on 2017/12/5 下午3:08. 
*/ -abstract class IMGEditBaseActivity extends AppCompatActivity implements View.OnClickListener, - IMGTextEditDialog.Callback, RadioGroup.OnCheckedChangeListener, +abstract class IMGEditBaseActivity extends AppCompatActivity implements IMGTextEditDialog.Callback, RadioGroup.OnCheckedChangeListener, DialogInterface.OnShowListener, DialogInterface.OnDismissListener { public static final int OP_HIDE = -1; @@ -61,34 +60,19 @@ private void initViews() { mColorGroup.setOnCheckedChangeListener(this); mLayoutOpSub = findViewById(R.id.layout_op_sub); - } - @Override - public void onClick(View v) { - int vid = v.getId(); - if (vid == R.id.rb_doodle) { - onModeClick(IMGMode.DOODLE); - } else if (vid == R.id.btn_text) { - onTextModeClick(); - } else if (vid == R.id.rb_mosaic) { - onModeClick(IMGMode.MOSAIC); - } else if (vid == R.id.btn_clip) { - onModeClick(IMGMode.CLIP); - } else if (vid == R.id.btn_undo) { - onUndoClick(); - } else if (vid == R.id.tv_done) { - onDoneClick(); - } else if (vid == R.id.tv_cancel) { - onCancelClick(); - } else if (vid == R.id.ib_clip_cancel) { - onCancelClipClick(); - } else if (vid == R.id.ib_clip_done) { - onDoneClipClick(); - } else if (vid == R.id.tv_clip_reset) { - onResetClipClick(); - } else if (vid == R.id.ib_clip_rotate) { - onRotateClipClick(); - } + findViewById(R.id.ib_clip_rotate).setOnClickListener(v -> onRotateClipClick()); + findViewById(R.id.ib_clip_cancel).setOnClickListener(v -> onCancelClipClick()); + findViewById(R.id.tv_clip_reset).setOnClickListener(v -> onResetClipClick()); + findViewById(R.id.ib_clip_done).setOnClickListener(v -> onDoneClipClick()); + + findViewById(R.id.rb_doodle).setOnClickListener(v -> onModeClick(IMGMode.DOODLE)); + findViewById(R.id.btn_text).setOnClickListener(v -> onTextModeClick()); + findViewById(R.id.rb_mosaic).setOnClickListener(v -> onModeClick(IMGMode.MOSAIC)); + findViewById(R.id.btn_clip).setOnClickListener(v -> onModeClick(IMGMode.CLIP)); + 
findViewById(R.id.btn_undo).setOnClickListener(v -> onUndoClick()); + findViewById(R.id.tv_done).setOnClickListener(v -> onDoneClick()); + findViewById(R.id.tv_cancel).setOnClickListener(v -> onCancelClick()); } public void updateModeUI() { diff --git a/image/src/main/java/me/minetsh/imaging/IMGTextEditDialog.java b/image/src/main/java/me/minetsh/imaging/IMGTextEditDialog.java index 026c8675e..29f44f591 100644 --- a/image/src/main/java/me/minetsh/imaging/IMGTextEditDialog.java +++ b/image/src/main/java/me/minetsh/imaging/IMGTextEditDialog.java @@ -8,9 +8,10 @@ import android.view.View; import android.view.ViewGroup; import android.view.Window; -import android.widget.EditText; import android.widget.RadioGroup; +import com.google.android.material.textfield.TextInputEditText; + import me.minetsh.imaging.core.IMGText; import me.minetsh.imaging.view.IMGColorGroup; @@ -23,7 +24,7 @@ public class IMGTextEditDialog extends Dialog implements View.OnClickListener, private static final String TAG = "IMGTextEditDialog"; private final Callback mCallback; - private EditText mEditText; + private TextInputEditText mEditText; private IMGText mDefaultText; private IMGColorGroup mColorGroup; diff --git a/image/src/main/java/me/minetsh/imaging/core/IMGImage.java b/image/src/main/java/me/minetsh/imaging/core/IMGImage.java index ede5a2da7..49c91171c 100644 --- a/image/src/main/java/me/minetsh/imaging/core/IMGImage.java +++ b/image/src/main/java/me/minetsh/imaging/core/IMGImage.java @@ -507,7 +507,7 @@ public void onDrawImage(Canvas canvas) { } public int onDrawMosaicsPath(Canvas canvas) { - int layerCount = canvas.saveLayer(mFrame, null, Canvas.ALL_SAVE_FLAG); + int layerCount = canvas.saveLayer(mFrame, null); if (!isMosaicEmpty()) { canvas.save(); diff --git a/image/src/main/java/me/minetsh/imaging/view/IMGStickerXText.java b/image/src/main/java/me/minetsh/imaging/view/IMGStickerXText.java index ea07f6735..9089dd31a 100644 --- 
a/image/src/main/java/me/minetsh/imaging/view/IMGStickerXText.java +++ b/image/src/main/java/me/minetsh/imaging/view/IMGStickerXText.java @@ -3,6 +3,8 @@ import android.content.res.Resources; import android.graphics.Canvas; import android.graphics.Paint; +import android.graphics.text.LineBreaker; +import android.os.Build; import android.text.Layout; import android.text.StaticLayout; import android.text.TextPaint; @@ -16,7 +18,6 @@ */ public class IMGStickerXText extends IMGStickerX { - private final TextPaint mTextPaint = new TextPaint(Paint.ANTI_ALIAS_FLAG); private StaticLayout mTextLayout; @@ -28,12 +29,28 @@ public IMGStickerXText(IMGText text) { setText(text); } + @SuppressWarnings("deprecation") public void setText(IMGText text) { - mTextPaint.setColor(text.getColor()); - mTextLayout = new StaticLayout(text.getText(), mTextPaint, - Math.round(Resources.getSystem().getDisplayMetrics().widthPixels * 0.8f), - Layout.Alignment.ALIGN_NORMAL, 1f, 0, false); + if (Build.VERSION.SDK_INT < Build.VERSION_CODES.M) { + mTextLayout = new StaticLayout(text.getText(), mTextPaint, + Math.round(Resources.getSystem().getDisplayMetrics().widthPixels * 0.8f), + Layout.Alignment.ALIGN_NORMAL, 1f, 0, false); + } else { + StaticLayout.Builder builder = + StaticLayout.Builder.obtain(text.getText(), 0, text.length(), mTextPaint, Math.round(Resources.getSystem().getDisplayMetrics().widthPixels * 0.8f)) + .setAlignment(Layout.Alignment.ALIGN_NORMAL) + .setLineSpacing(0.f, 1.f) + .setIncludePad(false); + + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + builder.setJustificationMode(LineBreaker.JUSTIFICATION_MODE_INTER_WORD); + } + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.P) { + builder.setUseLineSpacingFromFallbacks(true); + } + mTextLayout = builder.build(); + } float width = 0f; for (int i = 0; i < mTextLayout.getLineCount(); i++) { diff --git a/image/src/main/java/me/minetsh/imaging/view/IMGView.java b/image/src/main/java/me/minetsh/imaging/view/IMGView.java index 
633f1fc94..842fa6d51 100644 --- a/image/src/main/java/me/minetsh/imaging/view/IMGView.java +++ b/image/src/main/java/me/minetsh/imaging/view/IMGView.java @@ -23,6 +23,8 @@ import android.view.ViewParent; import android.widget.FrameLayout; +import androidx.annotation.NonNull; + import me.minetsh.imaging.core.IMGImage; import me.minetsh.imaging.core.IMGMode; import me.minetsh.imaging.core.IMGPath; @@ -458,7 +460,7 @@ protected void onDetachedFromWindow() { } @Override - public boolean onScale(ScaleGestureDetector detector) { + public boolean onScale(@NonNull ScaleGestureDetector detector) { if (mPointerCount > 1) { mImage.onScale(detector.getScaleFactor(), getScrollX() + detector.getFocusX(), @@ -470,7 +472,7 @@ public boolean onScale(ScaleGestureDetector detector) { } @Override - public boolean onScaleBegin(ScaleGestureDetector detector) { + public boolean onScaleBegin(@NonNull ScaleGestureDetector detector) { if (mPointerCount > 1) { mImage.onScaleBegin(); return true; @@ -479,7 +481,7 @@ public boolean onScaleBegin(ScaleGestureDetector detector) { } @Override - public void onScaleEnd(ScaleGestureDetector detector) { + public void onScaleEnd(@NonNull ScaleGestureDetector detector) { mImage.onScaleEnd(); } @@ -532,7 +534,7 @@ public boolean onRemove(V stickerView) { } @Override - public void onAnimationStart(Animator animation) { + public void onAnimationStart(@NonNull Animator animation) { if (DEBUG) { Log.d(TAG, "onAnimationStart"); } @@ -540,7 +542,7 @@ public void onAnimationStart(Animator animation) { } @Override - public void onAnimationEnd(Animator animation) { + public void onAnimationEnd(@NonNull Animator animation) { if (DEBUG) { Log.d(TAG, "onAnimationEnd"); } @@ -550,7 +552,7 @@ public void onAnimationEnd(Animator animation) { } @Override - public void onAnimationCancel(Animator animation) { + public void onAnimationCancel(@NonNull Animator animation) { if (DEBUG) { Log.d(TAG, "onAnimationCancel"); } @@ -558,7 +560,7 @@ public void 
onAnimationCancel(Animator animation) { } @Override - public void onAnimationRepeat(Animator animation) { + public void onAnimationRepeat(@NonNull Animator animation) { // empty implementation. } @@ -610,17 +612,17 @@ IMGPath toPath() { class MoveAdapter extends GestureDetector.SimpleOnGestureListener { @Override - public boolean onDown(MotionEvent e) { + public boolean onDown(@NonNull MotionEvent e) { return true; } @Override - public boolean onScroll(MotionEvent e1, MotionEvent e2, float distanceX, float distanceY) { + public boolean onScroll(@NonNull MotionEvent e1, @NonNull MotionEvent e2, float distanceX, float distanceY) { return IMGView.this.onScroll(distanceX, distanceY); } @Override - public boolean onFling(MotionEvent e1, MotionEvent e2, float velocityX, float velocityY) { + public boolean onFling(@NonNull MotionEvent e1, @NonNull MotionEvent e2, float velocityX, float velocityY) { // TODO return super.onFling(e1, e2, velocityX, velocityY); } diff --git a/image/src/main/res/layout/image_edit_clip_layout.xml b/image/src/main/res/layout/image_edit_clip_layout.xml index 9901af82a..8af6ea3b6 100644 --- a/image/src/main/res/layout/image_edit_clip_layout.xml +++ b/image/src/main/res/layout/image_edit_clip_layout.xml @@ -21,7 +21,6 @@ android:layout_marginStart="20dp" android:background="@null" android:contentDescription="@string/image_rotate" - android:onClick="onClick" android:src="@drawable/image_btn_rotate" /> @@ -62,7 +59,6 @@ android:layout_marginEnd="40dp" android:background="@null" android:contentDescription="@string/image_done" - android:onClick="onClick" android:src="@drawable/image_btn_ok" /> diff --git a/image/src/main/res/layout/image_edit_opt_layout.xml b/image/src/main/res/layout/image_edit_opt_layout.xml index 5ddb04a44..8f0b993fe 100644 --- a/image/src/main/res/layout/image_edit_opt_layout.xml +++ b/image/src/main/res/layout/image_edit_opt_layout.xml @@ -21,7 +21,6 @@ android:layout_marginTop="8dp" android:background="@null" 
android:contentDescription="@string/image_cancel" - android:onClick="onClick" android:src="@drawable/image_btn_cancel" /> @@ -122,7 +120,6 @@ android:layout_height="wrap_content" android:background="@null" android:contentDescription="@string/image_undo" - android:onClick="onClick" android:src="@drawable/image_btn_undo" /> @@ -142,7 +139,6 @@ android:layout_height="wrap_content" android:button="@drawable/image_btn_doodle" android:gravity="center" - android:onClick="onClick" android:textColor="#FFF" /> @@ -170,7 +165,6 @@ android:layout_height="wrap_content" android:button="@drawable/image_btn_mosaic" android:gravity="center" - android:onClick="onClick" android:textColor="#FFF" /> diff --git a/image/src/main/res/layout/image_text_dialog.xml b/image/src/main/res/layout/image_text_dialog.xml index 33ba0c8c0..d6a0fc0aa 100644 --- a/image/src/main/res/layout/image_text_dialog.xml +++ b/image/src/main/res/layout/image_text_dialog.xml @@ -31,20 +31,26 @@ - + android:layout_weight="1"> + + + diff --git a/image/src/main/res/layout/ucrop_controls.xml b/image/src/main/res/layout/ucrop_controls.xml index 531f7fed7..eb124745f 100644 --- a/image/src/main/res/layout/ucrop_controls.xml +++ b/image/src/main/res/layout/ucrop_controls.xml @@ -82,7 +82,7 @@ android:src="@drawable/ucrop_rotate" /> @@ -99,7 +99,7 @@ android:src="@drawable/ucrop_scale" /> diff --git a/image/src/main/res/layout/ucrop_fragment_photobox.xml b/image/src/main/res/layout/ucrop_fragment_photobox.xml deleted file mode 100644 index 70e97247e..000000000 --- a/image/src/main/res/layout/ucrop_fragment_photobox.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - - - - - - - - - - - - diff --git a/image/src/main/res/values/image_colors.xml b/image/src/main/res/values/image_colors.xml index 26e1d1263..cd8d6546a 100644 --- a/image/src/main/res/values/image_colors.xml +++ b/image/src/main/res/values/image_colors.xml @@ -16,7 +16,6 @@ #000 #673AB7 #20242F - #B3BECE #FFF @@ -26,7 +25,6 @@ @color/ucrop_color_black 
@color/ucrop_color_white @color/ucrop_color_blaze_orange - @color/ucrop_color_heather @color/ucrop_color_blaze_orange @color/ucrop_color_ebony_clay @color/ucrop_color_blaze_orange diff --git a/libfenrir/build.gradle b/libfenrir/build.gradle index c725d93e3..6e589b773 100644 --- a/libfenrir/build.gradle +++ b/libfenrir/build.gradle @@ -29,6 +29,7 @@ android { } } ndk { + //noinspection ChromeOsAbiSupport abiFilters "arm64-v8a", "armeabi-v7a", "x86_64" } } diff --git a/libfenrir/src/main/java/com/github/luben/zstd/ZstdBufferDecompressingStreamNoFinalizer.java b/libfenrir/src/main/java/com/github/luben/zstd/ZstdBufferDecompressingStreamNoFinalizer.java index d74de4433..c1ed0cfe7 100644 --- a/libfenrir/src/main/java/com/github/luben/zstd/ZstdBufferDecompressingStreamNoFinalizer.java +++ b/libfenrir/src/main/java/com/github/luben/zstd/ZstdBufferDecompressingStreamNoFinalizer.java @@ -37,7 +37,7 @@ long initDStream(long stream) { } @Override - long decompressStream(long stream, ByteBuffer dst, int dstOffset, int dstSize, ByteBuffer src, int srcOffset, int srcSize) { + long decompressStream(long stream, ByteBuffer dst, int dstBufPos, int dstSize, ByteBuffer src, int srcBufPos, int srcSize) { if (!src.hasArray()) { throw new IllegalArgumentException("provided source ByteBuffer lacks array"); } @@ -47,7 +47,10 @@ long decompressStream(long stream, ByteBuffer dst, int dstOffset, int dstSize, B byte[] targetArr = dst.array(); byte[] sourceArr = src.array(); - return decompressStreamNative(stream, targetArr, dstOffset, dstSize, sourceArr, srcOffset, srcSize); + // We are interested in array data corresponding to the pos represented by the ByteBuffer view. + // A ByteBuffer may share an underlying array with other ByteBuffers. In such scenario, we need to adjust the + // index of the array by adding an offset using arrayOffset(). 
+ return decompressStreamNative(stream, targetArr, dstBufPos + dst.arrayOffset(), dstSize, sourceArr, srcBufPos + src.arrayOffset(), srcSize); } public static int recommendedTargetBufferSize() { diff --git a/libfenrir/src/main/jni/CMakeLists.txt b/libfenrir/src/main/jni/CMakeLists.txt index 0b77cbd51..a83960000 100644 --- a/libfenrir/src/main/jni/CMakeLists.txt +++ b/libfenrir/src/main/jni/CMakeLists.txt @@ -123,49 +123,53 @@ target_include_directories(rlottie PRIVATE rlottie/src/vector/stb) add_library(libyuv STATIC + animation/libyuv/source/compare.cc animation/libyuv/source/compare_common.cc animation/libyuv/source/compare_gcc.cc + animation/libyuv/source/compare_msa.cc animation/libyuv/source/compare_neon64.cc - animation/libyuv/source/compare_win.cc - animation/libyuv/source/compare.cc + animation/libyuv/source/compare_neon.cc animation/libyuv/source/convert_argb.cc + animation/libyuv/source/convert.cc animation/libyuv/source/convert_from_argb.cc animation/libyuv/source/convert_from.cc animation/libyuv/source/convert_jpeg.cc animation/libyuv/source/convert_to_argb.cc animation/libyuv/source/convert_to_i420.cc - animation/libyuv/source/convert.cc animation/libyuv/source/cpu_id.cc animation/libyuv/source/mjpeg_decoder.cc animation/libyuv/source/mjpeg_validate.cc animation/libyuv/source/planar_functions.cc animation/libyuv/source/rotate_any.cc animation/libyuv/source/rotate_argb.cc - animation/libyuv/source/rotate_lsx.cc + animation/libyuv/source/rotate.cc animation/libyuv/source/rotate_common.cc animation/libyuv/source/rotate_gcc.cc + animation/libyuv/source/rotate_lsx.cc + animation/libyuv/source/rotate_msa.cc animation/libyuv/source/rotate_neon64.cc - animation/libyuv/source/rotate_win.cc - animation/libyuv/source/rotate.cc + animation/libyuv/source/rotate_neon.cc animation/libyuv/source/row_any.cc - animation/libyuv/source/row_rvv.cc - animation/libyuv/source/row_lasx.cc - animation/libyuv/source/row_lsx.cc animation/libyuv/source/row_common.cc 
animation/libyuv/source/row_gcc.cc + animation/libyuv/source/row_lasx.cc + animation/libyuv/source/row_lsx.cc + animation/libyuv/source/row_msa.cc animation/libyuv/source/row_neon64.cc - animation/libyuv/source/row_win.cc + animation/libyuv/source/row_neon.cc + animation/libyuv/source/row_rvv.cc animation/libyuv/source/scale_any.cc animation/libyuv/source/scale_argb.cc + animation/libyuv/source/scale.cc animation/libyuv/source/scale_common.cc - animation/libyuv/source/scale_lsx.cc animation/libyuv/source/scale_gcc.cc + animation/libyuv/source/scale_lsx.cc + animation/libyuv/source/scale_msa.cc animation/libyuv/source/scale_neon64.cc - animation/libyuv/source/scale_win.cc - animation/libyuv/source/scale.cc - animation/libyuv/source/video_common.cc + animation/libyuv/source/scale_neon.cc animation/libyuv/source/scale_rgb.cc - animation/libyuv/source/scale_uv.cc) + animation/libyuv/source/scale_uv.cc + animation/libyuv/source/video_common.cc) target_compile_options(libyuv PRIVATE -ffast-math ${OPTIMIZE_NORMAL} -funroll-loops -fno-strict-aliasing -fno-math-errno ${SYM_VISIBILITY}) target_include_directories(libyuv PRIVATE diff --git a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert.h b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert.h index 4f6d2536e..88619a4f6 100644 --- a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert.h +++ b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert.h @@ -367,6 +367,23 @@ int I212ToI422(const uint16_t* src_y, int width, int height); +#define H212ToH420 I212ToI420 +LIBYUV_API +int I212ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define H412ToH444 I412ToI444 LIBYUV_API int I412ToI444(const uint16_t* src_y, @@ -384,6 +401,23 @@ int I412ToI444(const 
uint16_t* src_y, int width, int height); +#define H412ToH420 I412ToI420 +LIBYUV_API +int I412ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define I412ToI012 I410ToI010 #define H410ToH010 I410ToI010 #define H412ToH012 I410ToI010 diff --git a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert_argb.h b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert_argb.h index 8e4562efc..35eeac9b2 100644 --- a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert_argb.h +++ b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/convert_argb.h @@ -67,6 +67,8 @@ LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I012ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I012ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ diff --git a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h index 3488d2568..5b244d77e 100644 --- a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h +++ b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h @@ -807,6 +807,7 @@ extern "C" { #define HAS_ABGRTOYROW_RVV #define HAS_ABGRTOYJROW_RVV #define HAS_BGRATOYROW_RVV +#define HAS_COPYROW_RVV #define HAS_I400TOARGBROW_RVV #define HAS_I422ALPHATOARGBROW_RVV #define HAS_I422TOARGBROW_RVV @@ -815,12 +816,15 @@ extern "C" { #define 
HAS_I444ALPHATOARGBROW_RVV #define HAS_I444TOARGBROW_RVV #define HAS_I444TORGB24ROW_RVV +#define HAS_INTERPOLATEROW_RVV #define HAS_J400TOARGBROW_RVV #define HAS_MERGEARGBROW_RVV #define HAS_MERGERGBROW_RVV +#define HAS_MERGEUVROW_RVV #define HAS_MERGEXRGBROW_RVV #define HAS_SPLITARGBROW_RVV #define HAS_SPLITRGBROW_RVV +#define HAS_SPLITUVROW_RVV #define HAS_SPLITXRGBROW_RVV #define HAS_RAWTOARGBROW_RVV #define HAS_RAWTORGB24ROW_RVV @@ -832,9 +836,6 @@ extern "C" { #define HAS_RGB24TOYROW_RVV #define HAS_RGBATOYROW_RVV #define HAS_RGBATOYJROW_RVV -#define HAS_SPLITARGBROW_RVV -#define HAS_SPLITRGBROW_RVV -#define HAS_SPLITXRGBROW_RVV #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -2242,6 +2243,10 @@ void SplitUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -2403,6 +2408,10 @@ void MergeUVRow_LSX(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); void MergeUVRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -3038,6 +3047,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count); void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3368,15 +3378,15 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB565DitherRow_C(const 
uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -3404,7 +3414,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, int width); void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -3417,15 +3427,15 @@ void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, int width); void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -5858,6 +5868,11 @@ void InterpolateRow_LSX(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction); +void InterpolateRow_RVV(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); void InterpolateRow_Any_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride_ptr, diff --git a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/version.h b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/version.h index 9568200e8..b6623dbbe 100644 --- a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/version.h +++ 
b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1869 +#define LIBYUV_VERSION 1871 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/libfenrir/src/main/jni/animation/libyuv/source/compare_mmi.cc b/libfenrir/src/main/jni/animation/libyuv/source/compare_mmi.cc deleted file mode 100644 index 7640d9468..000000000 --- a/libfenrir/src/main/jni/animation/libyuv/source/compare_mmi.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// Hakmem method for hamming distance. 
-uint32_t HammingDistance_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0; - uint64_t c1 = 0x5555555555555555; - uint64_t c2 = 0x3333333333333333; - uint64_t c3 = 0x0f0f0f0f0f0f0f0f; - uint32_t c4 = 0x01010101; - uint64_t s1 = 1, s2 = 2, s3 = 4; - __asm__ volatile( - "1: \n\t" - "ldc1 %[ta], 0(%[src_a]) \n\t" - "ldc1 %[tb], 0(%[src_b]) \n\t" - "xor %[temp], %[ta], %[tb] \n\t" - "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1 - "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1 - "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1 - "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2) - "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2 - "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2 - "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t - "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4 - "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4) - "and %[temp1], %[temp1], %[c3] \n\t" //&c3 - "dmfc1 $t0, %[temp1] \n\t" - "dsrl32 $t0, $t0, 0 \n\t " - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "dmfc1 $t0, %[temp1] \n\t" - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "daddiu %[src_a], %[src_a], 8 \n\t" - "daddiu %[src_b], %[src_b], 8 \n\t" - "addiu %[count], %[count], -8 \n\t" - "bgtz %[count], 1b \n\t" - "nop \n\t" - : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b), - [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp), - [temp1] "+f"(temp1) - : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1), - [s2] "f"(s2), [s3] "f"(s3) - : "memory"); - return diff; -} - -uint32_t SumSquareError_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - uint32_t sse_hi = 0u, sse_lo = 0u; - - uint64_t src1, src2; - uint64_t diff, diff_hi, diff_lo; - uint64_t sse_sum, sse_tmp; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( 
- "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t" - - "1: \n\t" - "ldc1 %[src1], 0x00(%[src_a]) \n\t" - "ldc1 %[src2], 0x00(%[src_b]) \n\t" - "pasubub %[diff], %[src1], %[src2] \n\t" - "punpcklbh %[diff_lo], %[diff], %[mask] \n\t" - "punpckhbh %[diff_hi], %[diff], %[mask] \n\t" - "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - - "daddiu %[src_a], %[src_a], 0x08 \n\t" - "daddiu %[src_b], %[src_b], 0x08 \n\t" - "daddiu %[count], %[count], -0x08 \n\t" - "bnez %[count], 1b \n\t" - - "mfc1 %[sse_lo], %[sse_sum] \n\t" - "mfhc1 %[sse_hi], %[sse_sum] \n\t" - "daddu %[sse], %[sse_hi], %[sse_lo] \n\t" - : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1), - [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi), - [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp), - [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo) - : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count), - [mask] "f"(mask) - : "memory"); - - return sse; -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/libfenrir/src/main/jni/animation/libyuv/source/compare_win.cc b/libfenrir/src/main/jni/animation/libyuv/source/compare_win.cc deleted file mode 100644 index 9bb27f1dd..000000000 --- a/libfenrir/src/main/jni/animation/libyuv/source/compare_win.cc +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" -#include "libyuv/row.h" - -#if defined(_MSC_VER) -#include // For __popcnt -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) - -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - int i; - for (i = 0; i < count - 3; i += 4) { - uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT - src_a += 4; - src_b += 4; - diff += __popcnt(x); - } - return diff; -} - -__declspec(naked) uint32_t - SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { - __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count - pxor xmm0, xmm0 - pxor xmm5, xmm5 - - wloop: - movdqu xmm1, [eax] - lea eax, [eax + 16] - movdqu xmm2, [edx] - lea edx, [edx + 16] - movdqa xmm3, xmm1 // abs trick - psubusb xmm1, xmm2 - psubusb xmm2, xmm3 - por xmm1, xmm2 - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm0, xmm1 - paddd xmm0, xmm2 - sub ecx, 16 - jg wloop - - pshufd xmm1, xmm0, 0xee - paddd xmm0, xmm1 - pshufd xmm1, xmm0, 0x01 - paddd xmm0, xmm1 - movd eax, xmm0 - ret - } -} - -#ifdef HAS_SUMSQUAREERROR_AVX2 -// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 
-#pragma warning(disable : 4752) -__declspec(naked) uint32_t - SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { - __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count - vpxor ymm0, ymm0, ymm0 // sum - vpxor ymm5, ymm5, ymm5 // constant 0 for unpck - sub edx, eax - - wloop: - vmovdqu ymm1, [eax] - vmovdqu ymm2, [eax + edx] - lea eax, [eax + 32] - vpsubusb ymm3, ymm1, ymm2 // abs difference trick - vpsubusb ymm2, ymm2, ymm1 - vpor ymm1, ymm2, ymm3 - vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. - vpunpckhbw ymm1, ymm1, ymm5 - vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. - vpmaddwd ymm1, ymm1, ymm1 - vpaddd ymm0, ymm0, ymm1 - vpaddd ymm0, ymm0, ymm2 - sub ecx, 32 - jg wloop - - vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. - vpaddd ymm0, ymm0, ymm1 - vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. - vpaddd ymm0, ymm0, ymm1 - vpermq ymm1, ymm0, 0x02 // high + low lane. - vpaddd ymm0, ymm0, ymm1 - vmovd eax, xmm0 - vzeroupper - ret - } -} -#endif // HAS_SUMSQUAREERROR_AVX2 - -uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 -uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 -}; -uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 -}; -uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 -}; -uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 -}; - -__declspec(naked) uint32_t - HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - movd xmm0, [esp + 12] // seed - - pxor xmm7, xmm7 // constant 0 for unpck - movdqa xmm6, xmmword ptr kHash16x33 - - wloop: - movdqu xmm1, [eax] // src[0-15] - lea eax, [eax + 16] - pmulld xmm0, xmm6 // hash *= 
33 ^ 16 - movdqa xmm5, xmmword ptr kHashMul0 - movdqa xmm2, xmm1 - punpcklbw xmm2, xmm7 // src[0-7] - movdqa xmm3, xmm2 - punpcklwd xmm3, xmm7 // src[0-3] - pmulld xmm3, xmm5 - movdqa xmm5, xmmword ptr kHashMul1 - movdqa xmm4, xmm2 - punpckhwd xmm4, xmm7 // src[4-7] - pmulld xmm4, xmm5 - movdqa xmm5, xmmword ptr kHashMul2 - punpckhbw xmm1, xmm7 // src[8-15] - movdqa xmm2, xmm1 - punpcklwd xmm2, xmm7 // src[8-11] - pmulld xmm2, xmm5 - movdqa xmm5, xmmword ptr kHashMul3 - punpckhwd xmm1, xmm7 // src[12-15] - pmulld xmm1, xmm5 - paddd xmm3, xmm4 // add 16 results - paddd xmm1, xmm2 - paddd xmm1, xmm3 - - pshufd xmm2, xmm1, 0x0e // upper 2 dwords - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0x01 - paddd xmm1, xmm2 - paddd xmm0, xmm1 - sub ecx, 16 - jg wloop - - movd eax, xmm0 // return hash - ret - } -} - -// Visual C 2012 required for AVX2. -#ifdef HAS_HASHDJB2_AVX2 -__declspec(naked) uint32_t - HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - vmovd xmm0, [esp + 12] // seed - - wloop: - vpmovzxbd xmm3, [eax] // src[0-3] - vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 - vpmovzxbd xmm4, [eax + 4] // src[4-7] - vpmulld xmm3, xmm3, xmmword ptr kHashMul0 - vpmovzxbd xmm2, [eax + 8] // src[8-11] - vpmulld xmm4, xmm4, xmmword ptr kHashMul1 - vpmovzxbd xmm1, [eax + 12] // src[12-15] - vpmulld xmm2, xmm2, xmmword ptr kHashMul2 - lea eax, [eax + 16] - vpmulld xmm1, xmm1, xmmword ptr kHashMul3 - vpaddd xmm3, xmm3, xmm4 // add 16 results - vpaddd xmm1, xmm1, xmm2 - vpaddd xmm1, xmm1, xmm3 - vpshufd xmm2, xmm1, 0x0e // upper 2 dwords - vpaddd xmm1, xmm1,xmm2 - vpshufd xmm2, xmm1, 0x01 - vpaddd xmm1, xmm1, xmm2 - vpaddd xmm0, xmm0, xmm1 - sub ecx, 16 - jg wloop - - vmovd eax, xmm0 // return hash - vzeroupper - ret - } -} -#endif // HAS_HASHDJB2_AVX2 - -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git 
a/libfenrir/src/main/jni/animation/libyuv/source/convert.cc b/libfenrir/src/main/jni/animation/libyuv/source/convert.cc index 075428d09..b11ab1bff 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/convert.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/convert.cc @@ -203,6 +203,99 @@ static int Planar16bitTo8bit(const uint16_t* src_y, return 0; } +static int I41xToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, scale, kFilterBilinear); + ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, scale, kFilterBilinear); + } + return 0; +} + +static int I21xToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { + return -1; + } + // Negative height means 
invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + const int dy = FixedDiv(height, uv_height); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + } + return 0; +} + // Convert 10 bit YUV to 8 bit. LIBYUV_API int I010ToI420(const uint16_t* src_y, @@ -240,38 +333,9 @@ int I210ToI420(const uint16_t* src_y, int dst_stride_v, int width, int height) { - const int depth = 10; - const int scale = 1 << (24 - depth); - - if (width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - { - const int uv_width = SUBSAMPLE(width, 1, 1); - const int uv_height = SUBSAMPLE(height, 1, 1); - const int dy = FixedDiv(height, uv_height); - - Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, - height); - ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, - dst_stride_u, src_u, dst_u, 0, 32768, dy, - /*bpp=*/1, scale, kFilterBilinear); - ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, - dst_stride_v, src_v, dst_v, 0, 32768, dy, - /*bpp=*/1, scale, kFilterBilinear); - } - return 0; + return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 10); } LIBYUV_API @@ -310,35 +374,9 @@ int I410ToI420(const uint16_t* src_y, int dst_stride_v, int width, int height) { - const int depth = 10; - const int scale = 1 << (24 - depth); - - if (width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - { - const int uv_width = SUBSAMPLE(width, 1, 1); - const int uv_height = SUBSAMPLE(height, 1, 1); - - Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, - height); - ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_u, - dst_stride_u, src_u, dst_u, scale, kFilterBilinear); - ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_v, - dst_stride_v, src_v, dst_v, scale, kFilterBilinear); - } - return 0; + return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 10); } LIBYUV_API @@ -404,6 +442,26 @@ int I212ToI422(const uint16_t* src_y, 0, 12); } +LIBYUV_API +int I212ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 12); +} + LIBYUV_API int I412ToI444(const uint16_t* src_y, int src_stride_y, @@ -425,6 +483,26 @@ int I412ToI444(const uint16_t* src_y, 0, 12); } +LIBYUV_API +int I412ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + 
src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 12); +} + // Any Ix10 To I010 format with mirroring. static int Ix10ToI010(const uint16_t* src_y, int src_stride_y, @@ -955,6 +1033,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow = MergeUVRow_RVV; + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -995,6 +1078,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height); @@ -1922,7 +2010,8 @@ int ARGBToI420Alpha(const uint8_t* src_argb, ARGBToYRow_C; void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, int width) = ARGBExtractAlphaRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. 
@@ -2045,7 +2134,8 @@ int ARGBToI420Alpha(const uint8_t* src_argb, ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); ARGBExtractAlphaRow(src_argb, dst_a, width); - ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a, width); + ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a, + width); src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; diff --git a/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc b/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc index f16c368d0..cc6560de6 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc @@ -5645,7 +5645,7 @@ int I420ToRGB565Dither(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = + uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; diff --git a/libfenrir/src/main/jni/animation/libyuv/source/convert_from_argb.cc b/libfenrir/src/main/jni/animation/libyuv/source/convert_from_argb.cc index 6e05876a0..c3d037c47 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/convert_from_argb.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/convert_from_argb.cc @@ -453,6 +453,11 @@ int ARGBToNV12(const uint8_t* src_argb, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. @@ -646,6 +651,11 @@ int ARGBToNV21(const uint8_t* src_argb, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. 
@@ -826,6 +836,11 @@ int ABGRToNV12(const uint8_t* src_abgr, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. @@ -1007,6 +1022,11 @@ int ABGRToNV21(const uint8_t* src_abgr, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. @@ -1721,7 +1741,7 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, int height) { int y; void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = + uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -3203,6 +3223,11 @@ int RAWToJNV21(const uint8_t* src_raw, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a row of uv. diff --git a/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc b/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc index e741dc509..d115a2a10 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc @@ -75,6 +75,11 @@ void CopyPlane(const uint8_t* src_y, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Copy plane for (y = 0; y < height; ++y) { @@ -545,6 +550,11 @@ void SplitUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_SPLITUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitUVRow = SplitUVRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Copy a row of UV. 
@@ -631,6 +641,11 @@ void MergeUVPlane(const uint8_t* src_u, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow = MergeUVRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. @@ -4348,6 +4363,11 @@ int InterpolatePlane(const uint8_t* src0, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { InterpolateRow(dst, src0, src1 - src0, width, interpolation); @@ -5560,6 +5580,12 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_SPLITUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitUVRow = SplitUVRow_RVV; + } +#endif + #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -5600,6 +5626,11 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif { int awidth = halfwidth * 2; @@ -5665,6 +5696,7 @@ void HalfMergeUVPlane(const uint8_t* src_u, HalfMergeUVRow = HalfMergeUVRow_AVX2; } #endif + for (y = 0; y < height - 1; y += 2) { // Merge a row of U and V into a row of UV. HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width); diff --git a/libfenrir/src/main/jni/animation/libyuv/source/rotate.cc b/libfenrir/src/main/jni/animation/libyuv/source/rotate.cc index 6797ff02b..8d3978c71 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/rotate.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/rotate.cc @@ -214,6 +214,11 @@ void RotatePlane180(const uint8_t* src, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Odd height will harmlessly mirror the middle row twice. 
for (y = 0; y < half_height; ++y) { diff --git a/libfenrir/src/main/jni/animation/libyuv/source/rotate_argb.cc b/libfenrir/src/main/jni/animation/libyuv/source/rotate_argb.cc index 9667f34c2..c72390108 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/rotate_argb.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/rotate_argb.cc @@ -192,6 +192,11 @@ static int ARGBRotate180(const uint8_t* src_argb, CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Odd height will harmlessly mirror the middle row twice. for (y = 0; y < half_height; ++y) { diff --git a/libfenrir/src/main/jni/animation/libyuv/source/rotate_mmi.cc b/libfenrir/src/main/jni/animation/libyuv/source/rotate_mmi.cc deleted file mode 100644 index f8de60834..000000000 --- a/libfenrir/src/main/jni/animation/libyuv/source/rotate_mmi.cc +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. 
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -void TransposeWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (00 10 01 11 02 12 03 13) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (04 14 05 15 06 16 07 17) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (20 30 21 31 22 32 23 33) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (24 34 25 35 26 36 27 37) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (00 10 20 30 01 11 21 31) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (02 12 22 32 03 13 23 33) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (04 14 24 34 05 15 25 35) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (06 16 26 36 07 17 27 37) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (40 50 41 51 42 52 43 53) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (44 54 45 55 46 56 47 57) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (60 70 61 71 62 72 63 73) */ - "punpcklbh %[tmp2], %[tmp12], 
%[tmp13] \n\t" - /* tmp3 = (64 74 65 75 66 76 67 77) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (40 50 60 70 41 51 61 71) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (42 52 62 72 43 53 63 73) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (44 54 64 74 45 55 65 75) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (46 56 66 76 47 57 67 77) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (00 10 20 30 40 50 60 70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (01 11 21 31 41 51 61 71) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (02 12 22 32 42 52 62 72) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (03 13 23 33 43 53 63 73) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (04 14 24 34 44 54 64 74) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (05 15 25 35 45 55 65 75) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (06 16 26 36 46 56 66 76) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (07 17 27 37 47 57 67 77) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], 
%[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "daddi %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst), - [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride), - [dst_stride] "r"(dst_stride) - : "memory"); -} - -void TransposeUVWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */ - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */ - 
"punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 
%[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "daddiu %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a), - [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), 
[width] "r"(width), [dst_stride_a] "r"(dst_stride_a), - [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride) - : "memory"); -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/libfenrir/src/main/jni/animation/libyuv/source/rotate_win.cc b/libfenrir/src/main/jni/animation/libyuv/source/rotate_win.cc deleted file mode 100644 index a78873f84..000000000 --- a/libfenrir/src/main/jni/animation/libyuv/source/rotate_win.cc +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) - -__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - __asm { - push edi - push esi - push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride - mov edx, [esp + 12 + 12] // dst - mov esi, [esp + 12 + 16] // dst_stride - mov ecx, [esp + 12 + 20] // width - - // Read in the data from the source pointer. - // First round of bit swap. 
- align 4 - convertloop: - movq xmm0, qword ptr [eax] - lea ebp, [eax + 8] - movq xmm1, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm0, xmm1 - movq xmm2, qword ptr [eax] - movdqa xmm1, xmm0 - palignr xmm1, xmm1, 8 - movq xmm3, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm2, xmm3 - movdqa xmm3, xmm2 - movq xmm4, qword ptr [eax] - palignr xmm3, xmm3, 8 - movq xmm5, qword ptr [eax + edi] - punpcklbw xmm4, xmm5 - lea eax, [eax + 2 * edi] - movdqa xmm5, xmm4 - movq xmm6, qword ptr [eax] - palignr xmm5, xmm5, 8 - movq xmm7, qword ptr [eax + edi] - punpcklbw xmm6, xmm7 - mov eax, ebp - movdqa xmm7, xmm6 - palignr xmm7, xmm7, 8 - // Second round of bit swap. - punpcklwd xmm0, xmm2 - punpcklwd xmm1, xmm3 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - palignr xmm2, xmm2, 8 - palignr xmm3, xmm3, 8 - punpcklwd xmm4, xmm6 - punpcklwd xmm5, xmm7 - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - palignr xmm6, xmm6, 8 - palignr xmm7, xmm7, 8 - // Third round of bit swap. - // Write to the destination pointer. 
- punpckldq xmm0, xmm4 - movq qword ptr [edx], xmm0 - movdqa xmm4, xmm0 - palignr xmm4, xmm4, 8 - movq qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - punpckldq xmm2, xmm6 - movdqa xmm6, xmm2 - palignr xmm6, xmm6, 8 - movq qword ptr [edx], xmm2 - punpckldq xmm1, xmm5 - movq qword ptr [edx + esi], xmm6 - lea edx, [edx + 2 * esi] - movdqa xmm5, xmm1 - movq qword ptr [edx], xmm1 - palignr xmm5, xmm5, 8 - punpckldq xmm3, xmm7 - movq qword ptr [edx + esi], xmm5 - lea edx, [edx + 2 * esi] - movq qword ptr [edx], xmm3 - movdqa xmm7, xmm3 - palignr xmm7, xmm7, 8 - sub ecx, 8 - movq qword ptr [edx + esi], xmm7 - lea edx, [edx + 2 * esi] - jg convertloop - - pop ebp - pop esi - pop edi - ret - } -} - -__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int w) { - __asm { - push ebx - push esi - push edi - push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride - mov edx, [esp + 16 + 12] // dst_a - mov esi, [esp + 16 + 16] // dst_stride_a - mov ebx, [esp + 16 + 20] // dst_b - mov ebp, [esp + 16 + 24] // dst_stride_b - mov ecx, esp - sub esp, 4 + 16 - and esp, ~15 - mov [esp + 16], ecx - mov ecx, [ecx + 16 + 28] // w - - align 4 - // Read in the data from the source pointer. - // First round of bit swap. - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm0 // use xmm7 as temp register. 
- punpcklbw xmm0, xmm1 - punpckhbw xmm7, xmm1 - movdqa xmm1, xmm7 - movdqu xmm2, [eax] - movdqu xmm3, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm2 - punpcklbw xmm2, xmm3 - punpckhbw xmm7, xmm3 - movdqa xmm3, xmm7 - movdqu xmm4, [eax] - movdqu xmm5, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm4 - punpcklbw xmm4, xmm5 - punpckhbw xmm7, xmm5 - movdqa xmm5, xmm7 - movdqu xmm6, [eax] - movdqu xmm7, [eax + edi] - lea eax, [eax + 2 * edi] - movdqu [esp], xmm5 // backup xmm5 - neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. - punpcklbw xmm6, xmm7 - punpckhbw xmm5, xmm7 - movdqa xmm7, xmm5 - lea eax, [eax + 8 * edi + 16] - neg edi - // Second round of bit swap. - movdqa xmm5, xmm0 - punpcklwd xmm0, xmm2 - punpckhwd xmm5, xmm2 - movdqa xmm2, xmm5 - movdqa xmm5, xmm1 - punpcklwd xmm1, xmm3 - punpckhwd xmm5, xmm3 - movdqa xmm3, xmm5 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm6 - punpckhwd xmm5, xmm6 - movdqa xmm6, xmm5 - movdqu xmm5, [esp] // restore xmm5 - movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. - punpcklwd xmm5, xmm7 - punpckhwd xmm6, xmm7 - movdqa xmm7, xmm6 - - // Third round of bit swap. - // Write to the destination pointer. - movdqa xmm6, xmm0 - punpckldq xmm0, xmm4 - punpckhdq xmm6, xmm4 - movdqa xmm4, xmm6 - movdqu xmm6, [esp] // restore xmm6 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [ebx], xmm0 - movlpd qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm4 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. - punpckldq xmm2, xmm6 - movlpd qword ptr [edx], xmm2 - movhpd qword ptr [ebx], xmm2 - punpckhdq xmm0, xmm6 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. 
- punpckldq xmm1, xmm5 - movlpd qword ptr [edx], xmm1 - movhpd qword ptr [ebx], xmm1 - punpckhdq xmm0, xmm5 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. - punpckldq xmm3, xmm7 - movlpd qword ptr [edx], xmm3 - movhpd qword ptr [ebx], xmm3 - punpckhdq xmm0, xmm7 - sub ecx, 8 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - jg convertloop - - mov esp, [esp + 16] - pop ebp - pop edi - pop esi - pop ebx - ret - } -} - -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_common.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_common.cc index b7655b7b1..8be37fb58 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/row_common.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/row_common.cc @@ -342,7 +342,7 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { // or the upper byte for big endian. 
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { int x; for (x = 0; x < width - 1; x += 2) { diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_gcc.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_gcc.cc index aa4c0d11e..e94fd04df 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/row_gcc.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/row_gcc.cc @@ -738,7 +738,7 @@ void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "movd %3,%%xmm6 \n" @@ -786,7 +786,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "vbroadcastss %3,%%xmm6 \n" diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc index 29ac9254d..1082ad80b 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc @@ -1182,7 +1182,7 @@ void ARGBAttenuateRow_LASX(const uint8_t* src_argb, void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { int x; int len = width / 16; diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc index 573fc94de..e626072a9 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc @@ -794,10 +794,10 @@ void ARGBToUVRow_LSX(const uint8_t* src_argb0, __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009}; __m128i const_0x8080 = {0x8080808080808080, 
0x8080808080808080}; for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, - src_argb0, 48, src0, src1, src2, src3); - DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, - src_argb1, 48, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0, + 48, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, src_argb1, + 48, src4, src5, src6, src7); vec0 = __lsx_vaddwev_h_bu(src0, src4); vec1 = __lsx_vaddwev_h_bu(src1, src5); vec2 = __lsx_vaddwev_h_bu(src2, src6); @@ -846,8 +846,8 @@ void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { __m128i tmp0, tmp1, tmp2, tmp3; __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A}; for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, - 48, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); tmp0 = __lsx_vshuf_b(src0, src0, shuf); tmp1 = __lsx_vshuf_b(src1, src1, shuf); tmp2 = __lsx_vshuf_b(src2, src2, shuf); @@ -879,8 +879,8 @@ void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { __m128i tmp0, tmp1, tmp2, tmp3; __m128i shuf = {0x090A040506000102, 0x000000000C0D0E08}; for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, - 48, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); tmp0 = __lsx_vshuf_b(src0, src0, shuf); tmp1 = __lsx_vshuf_b(src1, src1, shuf); tmp2 = __lsx_vshuf_b(src2, src2, shuf); @@ -905,9 +905,7 @@ void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { __lsx_vst(tmp3, dst_rgb, 0); } -void ARGBToRGB565Row_LSX(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { +void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; 
int len = width / 8; __m128i zero = __lsx_vldi(0); @@ -995,8 +993,8 @@ void ARGBToUV444Row_LSX(const uint8_t* src_argb, __m128i const_18 = __lsx_vldi(18); __m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080}; for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, - 48, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); tmp0 = __lsx_vpickev_h(src1, src0); tmp1 = __lsx_vpickod_h(src1, src0); tmp2 = __lsx_vpickev_h(src3, src2); @@ -1138,7 +1136,7 @@ void ARGBAttenuateRow_LSX(const uint8_t* src_argb, void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { int x; int len = width / 8; diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_mmi.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_mmi.cc deleted file mode 100644 index 362fd1cfc..000000000 --- a/libfenrir/src/main/jni/animation/libyuv/source/row_mmi.cc +++ /dev/null @@ -1,7842 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "libyuv/row.h" - -#include // For memcpy and memset. - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. 
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - const uint64_t mask = 0xff000000ULL; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [mask] "f"(mask) - : "memory"); -} - -void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - uint64_t src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xff000000ULL; - const uint64_t mask2 = 0xc6; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], 
%[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width) - : "memory"); -} - -void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x6c; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t" - "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t" - "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[src1], %[src1], %[zero] \n\t" - "pextrh %[ftmp2], %[ftmp0], %[three] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[one] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[two] \n\t" - "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pextrh %[ftmp2], %[src1], %[zero] 
\n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t" - - "daddiu %[src_raw], %[src_raw], 0x0c \n\t" - "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]) - : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03) - : "memory"); -} - -void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[5]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[c1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] 
\n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]) - : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), - [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - uint64_t c4 = 0x0001000100010001; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psrlh %[a], %[src1], %[seven] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "xor %[a], %[a], %[c1] \n\t" - "paddb %[a], %[a], %[c4] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb 
%[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psrlh %[a], %[src1], %[four] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "psllh %[src0], %[a], %[four] \n\t" - "or %[a], %[src0], %[a] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 
0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08), - [four] "f"(0x04) - : "memory"); -} - -void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width) - : "memory"); -} - -void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - 
"gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[ftmp2], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - - "pextrh %[src0], %[ftmp1], %[two] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t" - "pshufh %[ftmp1], %[ftmp1], %[one] \n\t" - - "pextrh %[src0], %[ftmp2], %[two] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[one] \n\t" - "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[zero] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pinsrh_0 %[src1], %[src1], %[src0] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02) - : "memory"); -} - -void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - 
"punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05), - [eleven] "f"(0x0b) - : "memory"); -} - -// dither4 is a row of 4 values from 4x4 dither matrix. -// The 4x4 matrix contains values to increase RGB. When converting to -// fewer bits (565) this provides an ordered dither. -// The order in the 4x4 matrix in first byte is upper left. -// The 4 values are passed as an int, then referenced as an array, so -// endian will not affect order of the original matrix. But the dither4 -// will containing the first pixel in the lower byte for little endian -// or the upper byte for big endian. 
-void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t c0 = 0x00ff00ff00ff00ff; - - __asm__ volatile( - "punpcklbh %[dither], %[dither], %[zero] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "paddh %[b], %[b], %[dither] \n\t" - "paddh %[g], %[g], %[dither] \n\t" - "paddh %[r], %[r], %[dither] \n\t" - "pcmpgth %[src0], %[b], %[c0] \n\t" - "or %[src0], %[src0], %[b] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "pcmpgth %[src0], %[g], %[c0] \n\t" - "or %[src0], %[src0], %[g] \n\t" - "and %[g], %[src0], %[c0] \n\t" - "pcmpgth %[src0], %[r], %[c0] \n\t" - "or %[src0], %[src0], %[r] \n\t" - "and %[r], %[src0], %[c0] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02), - [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b) - : "memory"); -} - 
-void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[three] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - "psrlh %[a], %[a], %[seven] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[ten] \n\t" - "psllh %[a], %[a], %[fifteen] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05), - [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f) - : "memory"); -} - -void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], 
%[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[four] \n\t" - "psrlh %[g], %[g], %[four] \n\t" - "psrlh %[r], %[r], %[four] \n\t" - "psrlh %[a], %[a], %[four] \n\t" - - "psllh %[g], %[g], %[four] \n\t" - "psllh %[r], %[r], %[eight] \n\t" - "psllh %[a], %[a], %[twelve] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08), - [twelve] "f"(0x0c) - : "memory"); -} - -void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) 
\n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez 
%[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ARGBToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] 
\n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh 
%[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], 
%[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" 
- "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] 
"=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0019008100420001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - 
"pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void BGRAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 
0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] 
\n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw 
%[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], 
%[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], 
%[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" 
- "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - 
"packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ABGRToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002F00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) 
\n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - 
"gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], 
%[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] 
\n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] 
"=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0042008100190001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - 
"gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGBAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t 
tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" - "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - 
"punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" - "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], 
%[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" - "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] 
\n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" - "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw 
%[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 
0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], 
%[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGB24ToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh 
%[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" 
- "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], 
%[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], 
%[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], 
%[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - 
__asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - 
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RAWToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" 
- "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], 
%[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 
%[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd 
%[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd 
%[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest, dest0, dest1, dest2, dest3; - uint64_t tmp0, tmp1; - const uint64_t shift = 0x08; - const uint64_t value = 0x80; - const uint64_t mask0 = 0x0; 
- const uint64_t mask1 = 0x0001004D0096001DULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest2], %[dest2], %[shift] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh 
%[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest3], %[dest3], %[shift] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - "packushb %[dest], %[tmp0], %[tmp1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), - [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), - [width] "r"(width) - : "memory"); -} - -void ARGBToUVJRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0015002a003f0002; - const uint64_t mask_v = 0x0002003f0035000a; - - __asm__ volatile( - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] 
\n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] 
\n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - 
"gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh 
%[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] 
"r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] 
\n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) 
\n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] 
\n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - 
"punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08), - [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x1080108010801080; - uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] 
\n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] 
\n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04) - : "memory"); -} - -void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[13]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_rgb565]) 
\n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest0_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] 
\n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest1_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], 
%[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest2_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest2_v], %[src0], %[c2] \n\t" - "psllh %[dest2_v], %[dest2_v], %[three] \n\t" - "or %[dest2_v], %[src1], %[dest2_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], 
%[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest3_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest3_v], %[src0], %[c2] \n\t" - "psllh %[dest3_v], %[dest3_v], %[three] \n\t" - "or %[dest3_v], %[src1], %[dest3_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh 
%[g0], %[dest3_u], %[dest3_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t" - "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [value] "f"(value), [c0] "f"(c0), [c1] 
"f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [one] "f"(0x01) - : "memory"); -} - -void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest0_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh 
%[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest1_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], 
%[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh 
%[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest2_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - 
"psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest3_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], 
%[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[dest0_u], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t" - "packushb %[dest0_v], %[dest1_u], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t" - "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), - [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), - [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [two] "f"(0x02), [one] "f"(0x01) - : "memory"); -} - -void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[13]; 
- uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest0_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest0_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - 
"pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest1_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest1_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], 
%[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest2_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest2_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], 
%[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest3_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest3_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" - 
"punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t" - "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_argb4444] "r"(src_argb4444), - [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), - [c0] "f"(c0), [c1] "f"(c1), [mask] 
"f"(mask), [mask_u] "f"(mask_u), - [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04), - [two] "f"(0x02) - : "memory"); -} - -void ARGBToUV444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], 
%[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - 
"pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), - [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), - [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), - [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), - [dest3_v] "=&f"(ftmp[11]) - : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10), - [eight] "f"(0x08) - : "memory"); -} - -void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - uint64_t 
src, src_lo, src_hi, src37, dest, dest_lo, dest_hi; - uint64_t tmp0, tmp1; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x01; - const uint64_t mask2 = 0x0080004D0096001DULL; - const uint64_t mask3 = 0xFF000000FF000000ULL; - const uint64_t mask4 = ~mask3; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "and %[src37], %[src], %[mask3] \n\t" - - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t" - "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t" - "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t" - - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t" - "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t" - "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t" - "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask4] \n\t" - "or %[dest], %[dest], %[src37] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest), - [src37] "=&f"(src37) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] 
"f"(mask3), [mask4] "f"(mask4) - : "memory"); -} - -// Convert a row of image to Sepia tone. -void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) { - uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2; - uint64_t tmp0, tmp1; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x002300440011ULL; - const uint64_t mask2 = 0x002D00580016ULL; - const uint64_t mask3 = 0x003200620018ULL; - const uint64_t mask4 = 0xFF000000FF000000ULL; - const uint64_t shift = 0x07; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[dest37], %[dest], %[mask4] \n\t" - - "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t" - "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t" - "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t" - "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t" - "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" - "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" - "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t" - "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t" - "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t" - "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t" - "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" - "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" - "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "or %[dest], %[dest], %[dest37] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 
%[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), - [dest] "=&f"(dest) - : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [shift] "f"(shift) - : "memory"); -} - -// Apply color matrix to a row of image. Matrix is signed. -// TODO(fbarchard): Consider adding rounding (+32). -void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2, - dest3; - uint64_t matrix, matrix_hi, matrix_lo; - uint64_t tmp0, tmp1; - const uint64_t shift0 = 0x06; - const uint64_t shift1 = 0x08; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - - "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psraw %[dest0], %[dest0], %[shift0] \n\t" - - "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" - "gsldrc1 
%[matrix], 0x08(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psraw %[dest1], %[dest1], %[shift0] \n\t" - - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - - "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psraw %[dest2], %[dest2], %[shift0] \n\t" - - "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd 
%[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psraw %[dest3], %[dest3], %[shift0] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - "packushb %[dest], %[tmp0], %[tmp1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi), - [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix) - : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb), - [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0), - [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1) - : "memory"); -} - -void ARGBShadeRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "punpcklbh %[value], %[value], %[value] \n\t" - - "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" 
- "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src), - [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [value] "f"(value), [shift] "f"(shift) - : "memory"); -} - -void ARGBMultiplyRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo; - uint64_t dest, dest_lo, dest_hi; - const uint64_t mask = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[src0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[src0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask] \n\t" - - "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t" - "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -void ARGBAddRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 
%[src0], 0x00(%[src0_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "paddusb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); -} - -void ARGBSubtractRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "psubusb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); -} - -// Sobel functions which mimics SSSE3. 
-void SobelXRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - uint64_t y00 = 0, y10 = 0, y20 = 0; - uint64_t y02 = 0, y12 = 0, y22 = 0; - uint64_t zero = 0x0; - uint64_t sobel = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] - "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2] - "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i] - "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2] - "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" - - "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i] - "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t" - "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2] - "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y20], %[y20], %[zero] \n\t" - - "punpcklbh %[y02], %[y02], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - "punpcklbh %[y22], %[y22], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y10] \n\t" // a+b - "paddh %[y20], %[y20], %[y10] \n\t" // c+b - "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c - - "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub - "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub - "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub - - "pmaxsh %[y10], %[y00], %[y02] \n\t" - "pminsh %[y20], %[y00], %[y02] \n\t" - "psubh %[sobel], %[y10], %[y20] \n\t" // Abs - - "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" - "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" - "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" - "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" - "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" - - "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t" - "gsldrc1 %[y20], 0x04(%[src_y2]) 
\n\t" - "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t" - "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y20], %[y20], %[zero] \n\t" - - "punpcklbh %[y02], %[y02], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - "punpcklbh %[y22], %[y22], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y10] \n\t" - "paddh %[y20], %[y20], %[y10] \n\t" - "paddh %[y00], %[y00], %[y20] \n\t" - - "paddh %[y02], %[y02], %[y12] \n\t" - "paddh %[y22], %[y22], %[y12] \n\t" - "paddh %[y02], %[y02], %[y22] \n\t" - - "pmaxsh %[y10], %[y00], %[y02] \n\t" - "pminsh %[y20], %[y00], %[y02] \n\t" - "psubh %[y00], %[y10], %[y20] \n\t" - - "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t" - "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t" - - "daddiu %[src_y0], %[src_y0], 8 \n\t" - "daddiu %[src_y1], %[src_y1], 8 \n\t" - "daddiu %[src_y2], %[src_y2], 8 \n\t" - "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10), - [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22) - : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2), - [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero) - : "memory"); -} - -void SobelYRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - uint64_t y00 = 0, y01 = 0, y02 = 0; - uint64_t y10 = 0, y11 = 0, y12 = 0; - uint64_t zero = 0x0; - uint64_t sobel = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] - "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" - "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1] - "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2] - "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // 
a_sub=src_y1[i] - "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" - "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1] - "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2] - "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y01], %[y01], %[zero] \n\t" - "punpcklbh %[y02], %[y02], %[zero] \n\t" - - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y11], %[y11], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y01] \n\t" // a+b - "paddh %[y02], %[y02], %[y01] \n\t" // c+b - "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c - - "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub - "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub - "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub - - "pmaxsh %[y02], %[y00], %[y10] \n\t" - "pminsh %[y12], %[y00], %[y10] \n\t" - "psubh %[sobel], %[y02], %[y12] \n\t" // Abs - - "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" - "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" - "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t" - "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" - "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" - "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" - "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t" - "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" - "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y01], %[y01], %[zero] \n\t" - "punpcklbh %[y02], %[y02], %[zero] \n\t" - - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y11], %[y11], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y01] \n\t" - "paddh %[y02], %[y02], %[y01] \n\t" - "paddh %[y00], %[y00], %[y02] \n\t" - - "paddh %[y10], %[y10], %[y11] \n\t" - "paddh %[y12], %[y12], %[y11] \n\t" - "paddh %[y10], %[y10], %[y12] \n\t" - - "pmaxsh %[y02], %[y00], %[y10] \n\t" - 
"pminsh %[y12], %[y00], %[y10] \n\t" - "psubh %[y00], %[y02], %[y12] \n\t" - - "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t" - "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t" - - "daddiu %[src_y0], %[src_y0], 8 \n\t" - "daddiu %[src_y1], %[src_y1], 8 \n\t" - "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01), - [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12) - : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), - [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero) - : "memory"); -} - -void SobelRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - double temp[3]; - uint64_t c1 = 0xff000000ff000000; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i] - "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] - "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t" - // s7 s6 s5 s4 s3 s2 s1 s0 = a+b - "paddusb %[t2] , %[t0], %[t1] \n\t" - - // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0 - "punpcklbh %[t0], %[t2], %[t2] \n\t" - - // s1 s1 s0 s0->s1 s2 s1 s1 s0 s0 s0 s0 - "punpcklbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - // 255 s1 s1 s1 s55 s0 s0 s0 - "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t" - - // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2 - "punpckhbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - // 255 s3 s3 s3 255 s2 s2 s2 - "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t" - - // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4 - "punpckhbh %[t0], %[t2], %[t2] \n\t" - - // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4 - "punpcklbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 
0x17(%[dst_argb]) \n\t" - - // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6 - "punpckhbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - uint64_t tr = 0; - uint64_t tb = 0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t" - "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t" - "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i] - "paddusb %[tr], %[tr], %[tb] \n\t" // g - "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t" - - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(tr), [tb] "=&f"(tb) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_y] "r"(dst_y), [width] "r"(width) - : "memory"); -} - -void SobelXYRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - uint64_t temp[3]; - uint64_t result = 0; - uint64_t gb = 0; - uint64_t cr = 0; - uint64_t c1 = 0xffffffffffffffff; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t" - "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] - "gsldrc1 %[tb], 
0x00(%[src_sobely]) \n\t" - "paddusb %[tg] , %[tr], %[tb] \n\t" // g - - // g3 b3 g2 b2 g1 b1 g0 b0 - "punpcklbh %[gb], %[tb], %[tg] \n\t" - // c3 r3 r2 r2 c1 r1 c0 r0 - "punpcklbh %[cr], %[tr], %[c1] \n\t" - // c1 r1 g1 b1 c0 r0 g0 b0 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t" - // c3 r3 g3 b3 c2 r2 g2 b2 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t" - - // g7 b7 g6 b6 g5 b5 g4 b4 - "punpckhbh %[gb], %[tb], %[tg] \n\t" - // c7 r7 c6 r6 c5 r5 c4 r4 - "punpckhbh %[cr], %[tr], %[c1] \n\t" - // c5 r5 g5 b5 c4 r4 g4 b4 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t" - // c7 r7 g7 b7 c6 r6 g6 b6 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]), - [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { - // Copy a Y to RGB. 
- uint64_t src, dest; - const uint64_t mask0 = 0x00ffffff00ffffffULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src], %[src], %[src] \n\t" - "punpcklhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -// TODO - respect YuvConstants -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, - const struct YuvConstants*, int width) { - uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x55; - const uint64_t mask2 = 0xAA; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = 0x4A354A354A354A35ULL; - const uint64_t mask5 = 0x0488048804880488ULL; - const uint64_t shift0 = 0x08; - const uint64_t shift1 = 0x06; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - - "pshufh %[src], %[src_lo], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - 
"pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_lo], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" 
- "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0), - [shift1] "f"(shift1), [width] "r"(width) - : "memory"); -} - -void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x1b; - - src += width - 1; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[source], 0(%[src_ptr]) \n\t" - "gsldrc1 %[source], -7(%[src_ptr]) \n\t" - "punpcklbh %[src0], %[source], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask1] \n\t" - "punpckhbh %[src1], %[source], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - 
"packushb %[dest], %[src1], %[src0] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void MirrorSplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src0, src1, dest0, dest1; - const uint64_t mask0 = 0x00ff00ff00ff00ffULL; - const uint64_t mask1 = 0x1b; - const uint64_t shift = 0x08; - - src_uv += (width - 1) << 1; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 1(%[src_ptr]) \n\t" - "gsldrc1 %[src0], -6(%[src_ptr]) \n\t" - "gsldlc1 %[src1], -7(%[src_ptr]) \n\t" - "gsldrc1 %[src1], -14(%[src_ptr]) \n\t" - - "and %[dest0], %[src0], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "and %[dest1], %[src1], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t" - - "psrlh %[dest0], %[src0], %[shift] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "psrlh %[dest1], %[src1], %[shift] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x10 \n\t" - "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t" - "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v), - [width] 
"r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [shift] "f"(shift) - : "memory"); -} - -void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - src += (width - 1) * 4; - uint64_t temp = 0x0; - uint64_t shuff = 0x4e; // 01 00 11 10 - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[temp], 3(%[src]) \n\t" - "gsldrc1 %[temp], -4(%[src]) \n\t" - "pshufh %[temp], %[temp], %[shuff] \n\t" - "gssdrc1 %[temp], 0x0(%[dst]) \n\t" - "gssdlc1 %[temp], 0x7(%[dst]) \n\t" - - "daddiu %[src], %[src], -0x08 \n\t" - "daddiu %[dst], %[dst], 0x08 \n\t" - "daddiu %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [temp] "=&f"(temp) - : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff) - : "memory"); -} - -void SplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t" - - "and %[t2], %[t0], %[c0] \n\t" - "and %[t3], %[t1], %[c0] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t" - - "psrlh %[t2], %[t0], %[shift] \n\t" - "psrlh %[t3], %[t1], %[shift] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t" - - "daddiu %[src_uv], %[src_uv], 16 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [t3] "=&f"(temp[3]) - : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -void MergeUVRow_MMI(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - 
int width) { - uint64_t temp[3]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_u]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_u]) \n\t" - "gsldrc1 %[t1], 0x0(%[src_v]) \n\t" - "gsldlc1 %[t1], 0x7(%[src_v]) \n\t" - "punpcklbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t" - "punpckhbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t" - - "daddiu %[src_u], %[src_u], 8 \n\t" - "daddiu %[src_v], %[src_v], 8 \n\t" - "daddiu %[dst_uv], %[dst_uv], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v), - [width] "r"(width) - : "memory"); -} - -void SplitRGBRow_MMI(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - uint64_t src[4]; - uint64_t dest_hi, dest_lo, dest; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[src1] \n\t" - "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src2], %[src3] \n\t" - - "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t" - "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t" - "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t" - "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t" - "daddiu %[dstg_ptr], 
%[dstg_ptr], 0x04 \n\t" - "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]), - [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g), - [dstb_ptr] "r"(dst_b), [width] "r"(width) - : "memory"); -} - -void MergeRGBRow_MMI(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - uint64_t srcr, srcg, srcb, dest; - uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo; - const uint64_t temp = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t" - "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t" - "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t" - "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t" - "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t" - "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t" - - "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t" - "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t" - "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t" - "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t" - - "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t" - "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - 
"punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t" - - "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t" - "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t" - "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb), - [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi), - [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi), - [srcbz_lo] "=&f"(srcbz_lo) - : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b), - [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp) - : "memory"); -} - -// Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], 
%[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 UV's (422) into U and V (422). 
-void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 Y's (422) into Y (420/422). 
-void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0) - : "memory"); -} - -// Filter 2 rows of UYVY UV's (422) into U and V (420). -void UYVYToUVRow_MMI(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. 
- uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] 
"=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY UV's (422) into U and V (422). -void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] 
"f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - // Output a row of Y values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t shift = 0x08; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "dsrl %[t0], %[t0], %[shift] \n\t" - "dsrl %[t1], %[t1], %[shift] \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Blend src_argb over src_argb1 and store to dst_argb. -// dst_argb may be src_argb or src_argb1. -// This code mimics the SSSE3 version for better testability. 
-void ARGBBlendRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi, - dest_lo; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_lo] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t" - - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_hi] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[mask4] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : 
[src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void BlendPlaneRow_MMI(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - uint64_t source0, source1, dest, alph; - uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi, - dest_lo; - uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t" - "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t" - "psubusb %[alpha_r], %[mask1], %[alpha] \n\t" - "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t" - "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t" - "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t" - "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t" - - "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t" - "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t" - "paddush %[dest_lo], %[dest_lo], %[dest] \n\t" - "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - - "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t" - "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t" - "paddush %[dest_hi], %[dest_hi], %[dest] \n\t" - "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], 
%[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi), - [alpha_r] "=&f"(alpha_rev) - : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha), - [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -// Multiply source RGB by alpha and store to destination. -// This code mimics the SSSE3 version for better testability. 
-void ARGBAttenuateRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha; - const uint64_t mask0 = 0xFF; - const uint64_t mask1 = 0xFF000000FF000000ULL; - const uint64_t mask2 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "pshufh %[alpha], %[src_lo], %[mask0] \n\t" - "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pshufh %[alpha], %[src_hi], %[mask0] \n\t" - "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask2] \n\t" - "and %[src], %[src], %[mask1] \n\t" - "or %[dest], %[dest], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift), - [width] "r"(width) - : "memory"); -} - -void ComputeCumulativeSumRow_MMI(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - int64_t row_sum[2] = {0, 0}; - uint64_t src, dest0, dest1, presrc0, presrc1, dest; - const uint64_t mask = 0x0; - - __asm__ volatile( - "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t" - "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t" - - "1: \n\t" - "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[row_ptr]) 
\n\t" - - "punpcklbh %[src], %[src], %[mask] \n\t" - "punpcklhw %[dest0], %[src], %[mask] \n\t" - "punpckhhw %[dest1], %[src], %[mask] \n\t" - - "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t" - "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t" - - "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t" - "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t" - - "paddw %[dest0], %[row_sum0], %[presrc0] \n\t" - "paddw %[dest1], %[row_sum1], %[presrc1] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t" - "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]), - [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0), - [presrc1] "=&f"(presrc1) - : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum), - [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -// C version 2x2 -> 2x1. 
-void InterpolateRow_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - if (source_y_fraction == 0) { - __asm__ volatile( - "1: \n\t" - "ld $t0, 0x0(%[src_ptr]) \n\t" - "sd $t0, 0x0(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : - : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width) - : "memory"); - return; - } - if (source_y_fraction == 128) { - uint64_t uv = 0x0; - uint64_t uv_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t" - "daddu $t0, %[src_ptr], %[stride] \n\t" - "gsldrc1 %[uv_stride], 0x0($t0) \n\t" - "gsldlc1 %[uv_stride], 0x7($t0) \n\t" - - "pavgb %[uv], %[uv], %[uv_stride] \n\t" - "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [stride] "r"((int64_t)src_stride) - : "memory"); - return; - } - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint64_t temp; - uint64_t data[4]; - uint64_t zero = 0x0; - uint64_t c0 = 0x0080008000800080; - uint64_t fy0 = 0x0100010001000100; - uint64_t shift = 0x8; - __asm__ volatile( - "pshufh %[fy1], %[fy1], %[zero] \n\t" - "psubh %[fy0], %[fy0], %[fy1] \n\t" - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t" - "punpcklbh %[d0], %[t0], %[zero] \n\t" - "punpckhbh %[d1], %[t0], %[zero] \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t" - "punpcklbh %[d2], %[t0], %[zero] \n\t" - "punpckhbh %[d3], %[t0], %[zero] \n\t" - - "pmullh %[d0], %[d0], %[fy0] 
\n\t" - "pmullh %[d2], %[d2], %[fy1] \n\t" - "paddh %[d0], %[d0], %[d2] \n\t" - "paddh %[d0], %[d0], %[c0] \n\t" - "psrlh %[d0], %[d0], %[shift] \n\t" - - "pmullh %[d1], %[d1], %[fy0] \n\t" - "pmullh %[d3], %[d3], %[fy1] \n\t" - "paddh %[d1], %[d1], %[d3] \n\t" - "paddh %[d1], %[d1], %[c0] \n\t" - "psrlh %[d1], %[d1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d1] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), - [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1), - [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0), - [shift] "f"(shift), [zero] "f"(zero) - : "memory"); -} - -// Use first 4 shuffler values to reorder ARGB channels. 
-void ARGBShuffleRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) | - ((shuffler[2] & 0x03) << 4) | - ((shuffler[3] & 0x03) << 6); - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[src], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "punpckhbh %[dest1], %[src], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest], %[dest0], %[dest1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I422ToYUY2Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - 
"daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void I422ToUYVYRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - "daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, dest; - const uint64_t mask0 = 0xff000000ff000000ULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 
0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[src], %[src], %[mask0] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[src], %[dest] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - uint64_t src, dest0, dest1, dest_lo, dest_hi, dest; - const uint64_t mask = 0xff000000ff000000ULL; - const uint64_t shift = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b 
\n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00ffffff00ffffffULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "punpckhbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], 
-0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I444ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - __asm__ volatile ( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - "punpcklbh %[u], %[u], %[zero] \n\t"//u - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - "punpcklbh %[v], %[v], %[zero] \n\t"//v - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], 
%[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// Also used for 420 -void I422ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t 
b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t"//v - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], 
%[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// 10 bit YUV to ARGB -void I210ToARGBRow_MMI(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) 
\n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "psllh %[y], %[y], %[six] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklhw %[u], %[u], %[u] \n\t" - "psrah %[u], %[u], %[two] \n\t" - "punpcklhw %[v], %[v], %[v] \n\t" - "psrah %[v], %[v], %[two] \n\t" - "pminsh %[u], %[u], %[mask1] \n\t" - "pminsh %[v], %[v], %[mask1] \n\t" - - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 
%[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask), [two]"f"(0x02), - [mask1]"f"(0x00ff00ff00ff00ff) - : "memory" - ); -} - -void I422AlphaToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v,a; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], 
%[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" - "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[a] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], 
%[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), [a]"=&f"(a), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [a_ptr]"r"(src_a), [zero]"f"(0x00), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -void I422ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - 
//u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - - "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" - "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" - "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" - "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" - "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" - "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" - "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" - "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" - "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 
0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(mask), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); -} - -void I422ToARGB4444Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], 
%[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "and %[g_vec], %[g_vec], %[mask1] \n\t" - "psrlw %[g_vec], %[g_vec], %[four] \n\t" - "psrlw %[r_vec], %[g_vec], %[four] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[g_vec], %[g_vec], %[r_vec] \n\t" - - "and %[b_vec], %[b_vec], %[mask1] \n\t" - "psrlw %[b_vec], %[b_vec], %[four] \n\t" - "psrlw %[r_vec], %[b_vec], %[four] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[b_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" - "daddi 
%[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), - [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), - [alpha]"f"(-1) - : "memory" - ); -} - -void I422ToARGB1555Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] 
\n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlw %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "or %[g_vec], %[g_vec], %[mask3] \n\t" - - "psrlw %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "or %[b_vec], %[b_vec], %[mask3] \n\t" - - 
"punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [mask3]"f"(0x800000008000), - [lmove5]"f"(0x5) - : "memory" - ); -} - -void I422ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], 
%[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - 
"psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7), - [lmove5]"f"(0x5) - : "memory" - ); -} - -void NV12ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - 
"ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) 
\n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); -} - -void NV21ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], 
%[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); -} - -void NV12ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, 
- int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd 
%[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [lmove1]"f"(0x18), - [one]"f"(0x1), [rmove1]"f"(0x8) - : "memory" - ); -} - -void NV21ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 
0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh 
%[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); -} - -void NV12ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], 
%[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw 
%[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7) - : "memory" - ); -} - -void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" 
- "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" - "psrlh %[temp], %[y], %[eight] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[y], %[y], %[temp] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi 
%[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[temp], %[y], %[temp] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[y], %[y], %[eight] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], 
%[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void I422ToRGBARow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 
%[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" - "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" - "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] 
\n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [alpha]"f"(-1) - : "memory" - ); -} - -void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { - __asm__ volatile ( - "punpcklwd %[v32], %[v32], %[v32] \n\t" - "1: \n\t" - "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" - - "daddi %[width], %[width], -0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "bnez %[width], 1b \n\t" - : [v32]"+&f"(v32) - : [dst_ptr]"r"(dst_argb), [width]"r"(width) - : "memory" - ); -} -// clang-format on - -// 10 bit YUV to ARGB -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc index b653765cc..4ed136381 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc @@ -1753,7 +1753,7 @@ void 
ARGBToRGB565Row_NEON(const uint8_t* src_argb, void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "vdup.32 d7, %2 \n" // dither4 diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc index ba34ff4c9..74190d611 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc @@ -1979,7 +1979,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "dup v1.4s, %w3 \n" // dither4 diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_rvv.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_rvv.cc index be4c4a309..27e91a3be 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/row_rvv.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/row_rvv.cc @@ -29,78 +29,76 @@ extern "C" { // Fill YUV -> RGB conversion constants into vectors // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). 
-#define YUVTORGB_SETUP(yuvconst, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, \ - v_br) \ - { \ - asm volatile("csrwi vxrm, 0"); \ - vl = __riscv_vsetvl_e8m1(w); \ - v_ub = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[0], vl); \ - v_vr = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[1], vl); \ - v_ug = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[2], vl); \ - v_vg = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[3], vl); \ - v_yg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[0], vl); \ - v_bb = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[1] + 32, vl); \ - v_bg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[2] - 32, vl); \ - v_br = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[3] + 32, vl); \ +#define YUVTORGB_SETUP(vl, yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ + { \ + asm volatile("csrwi vxrm, 0"); \ + ub = yuvconst->kUVCoeff[0]; \ + vr = yuvconst->kUVCoeff[1]; \ + ug = yuvconst->kUVCoeff[2]; \ + vg = yuvconst->kUVCoeff[3]; \ + yg = yuvconst->kRGBCoeffBias[0]; \ + bb = yuvconst->kRGBCoeffBias[1] + 32; \ + bg = yuvconst->kRGBCoeffBias[2] - 32; \ + br = yuvconst->kRGBCoeffBias[3] + 32; \ } // Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422 #define READYUV422(vl, v_u, v_v, v_y_16) \ { \ - vuint8mf2_t v_tmp0, v_tmp1; \ - vuint8m1_t v_y; \ - vuint16m1_t v_u_16, v_v_16; \ - vl = __riscv_vsetvl_e8mf2((w + 1) / 2); \ - v_tmp0 = __riscv_vle8_v_u8mf2(src_u, vl); \ - v_u_16 = __riscv_vwaddu_vx_u16m1(v_tmp0, 0, vl); \ - v_tmp1 = __riscv_vle8_v_u8mf2(src_v, vl); \ - v_v_16 = __riscv_vwaddu_vx_u16m1(v_tmp1, 0, vl); \ - v_v_16 = __riscv_vmul_vx_u16m1(v_v_16, 0x0101, vl); \ - v_u_16 = __riscv_vmul_vx_u16m1(v_u_16, 0x0101, vl); \ - v_v = __riscv_vreinterpret_v_u16m1_u8m1(v_v_16); \ - v_u = __riscv_vreinterpret_v_u16m1_u8m1(v_u_16); \ - vl = __riscv_vsetvl_e8m1(w); \ - v_y = __riscv_vle8_v_u8m1(src_y, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ 
+ v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } // Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444 #define READYUV444(vl, v_u, v_v, v_y_16) \ { \ - vuint8m1_t v_y; \ - vl = __riscv_vsetvl_e8m1(w); \ - v_y = __riscv_vle8_v_u8m1(src_y, vl); \ - v_u = __riscv_vle8_v_u8m1(src_u, vl); \ - v_v = __riscv_vle8_v_u8m1(src_v, vl); \ - v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \ + vuint8m2_t v_y; \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_u = __riscv_vle8_v_u8m2(src_u, vl); \ + v_v = __riscv_vle8_v_u8m2(src_v, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } // Convert from YUV to fixed point RGB -#define YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, \ - v_y_16, v_g_16, v_b_16, v_r_16) \ +#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \ + v_b_16, v_r_16) \ { \ - vuint16m2_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \ - vuint32m4_t v_tmp5; \ - v_tmp0 = __riscv_vwmulu_vv_u16m2(v_u, v_ug, vl); \ - v_y_16 = __riscv_vmul_vx_u16m2(v_y_16, 0x0101, vl); \ - v_tmp0 = __riscv_vwmaccu_vv_u16m2(v_tmp0, v_vg, v_v, vl); \ - v_tmp1 = __riscv_vwmulu_vv_u16m2(v_u, v_ub, vl); \ - v_tmp5 = __riscv_vwmulu_vv_u32m4(v_y_16, v_yg, vl); \ - v_tmp2 = __riscv_vnsrl_wx_u16m2(v_tmp5, 16, vl); \ - v_tmp3 = __riscv_vadd_vv_u16m2(v_tmp2, v_bg, vl); \ - v_tmp4 = __riscv_vadd_vv_u16m2(v_tmp2, v_tmp1, vl); \ - v_tmp2 = __riscv_vwmaccu_vv_u16m2(v_tmp2, v_vr, v_v, vl); \ - v_g_16 = __riscv_vssubu_vv_u16m2(v_tmp3, 
v_tmp0, vl); \ - v_b_16 = __riscv_vssubu_vv_u16m2(v_tmp4, v_bb, vl); \ - v_r_16 = __riscv_vssubu_vv_u16m2(v_tmp2, v_br, vl); \ + vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \ + vuint32m8_t v_tmp5; \ + v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl); \ + v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); \ + v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl); \ + v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl); \ + v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl); \ + v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl); \ + v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl); \ + v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl); \ + v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl); \ + v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl); \ + v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl); \ + v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \ } // Convert from fixed point RGB To 8 bit RGB #define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \ { \ - v_g = __riscv_vnclipu_wx_u8m1(v_g_16, 6, vl); \ - v_b = __riscv_vnclipu_wx_u8m1(v_b_16, 6, vl); \ - v_r = __riscv_vnclipu_wx_u8m1(v_r_16, 6, vl); \ + v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl); \ + v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \ + v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \ } void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { @@ -270,20 +268,19 @@ void I444ToARGBRow_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r, v_a; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); - v_a = __riscv_vmv_v_x_u8m1(255u, vl); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, 
br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { READYUV444(vl, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); w -= vl; src_y += vl; src_u += vl; @@ -301,20 +298,19 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r, v_a; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { READYUV444(vl, v_u, v_v, v_y_16); - v_a = __riscv_vle8_v_u8m1(src_a, vl); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); w -= vl; src_y += vl; src_a += vl; @@ -332,19 +328,18 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, 
- v_br); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { READYUV444(vl, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); w -= vl; src_y += vl; src_u += vl; @@ -361,20 +356,19 @@ void I422ToARGBRow_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r, v_a; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); - v_a = __riscv_vmv_v_x_u8m1(255u, vl); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { READYUV422(vl, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); w -= vl; src_y += vl; src_u += vl / 2; @@ -392,20 +386,19 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t 
v_b, v_g, v_r, v_a; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { READYUV422(vl, v_u, v_v, v_y_16); - v_a = __riscv_vle8_v_u8m1(src_a, vl); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); w -= vl; src_y += vl; src_a += vl; @@ -423,20 +416,19 @@ void I422ToRGBARow_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r, v_a; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); - v_a = __riscv_vmv_v_x_u8m1(255u, vl); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { READYUV422(vl, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg4e8_v_u8m1(dst_rgba, v_a, v_b, v_g, v_r, vl); + 
__riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); w -= vl; src_y += vl; src_u += vl / 2; @@ -453,19 +445,18 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, int width) { size_t vl; size_t w = (size_t)width; - vuint8m1_t v_u, v_v; - vuint8m1_t v_ub, v_vr, v_ug, v_vg; - vuint8m1_t v_b, v_g, v_r; - vuint16m2_t v_yg, v_bb, v_bg, v_br; - vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; - YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, - v_br); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); do { READYUV422(vl, v_u, v_v, v_y_16); - YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, - v_y_16, v_g_16, v_b_16, v_r_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); - __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); w -= vl; src_y += vl; src_u += vl / 2; @@ -528,6 +519,75 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { } while (w > 0); } +void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m8(w); + vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl); + __riscv_vse8_v_u8m8(dst, v_data, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} + +// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 +void InterpolateRow_RVV(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + size_t dst_w = (size_t)dst_width; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + // Blend 100 / 0 - Copy row 
unchanged. + if (y1_fraction == 0) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl); + dst_w -= vl; + src_ptr += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // Blend 50 / 50. + if (y1_fraction == 128) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl); + vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl); + // Averaging add + vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl); + __riscv_vse8_v_u8m8(dst_ptr, row_out, vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // General purpose row blend. + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). + asm volatile("csrwi vxrm, 0"); + do { + size_t vl = __riscv_vsetvl_e8m4(dst_w); + vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl); + vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl); + vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl); + acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl); + __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); +} + void SplitRGBRow_RVV(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -660,6 +720,42 @@ void MergeXRGBRow_RVV(const uint8_t* src_r, } while (w > 0); } +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_u, v_v; + __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl); + __riscv_vse8_v_u8m4(dst_u, v_u, vl); + __riscv_vse8_v_u8m4(dst_v, v_v, vl); + w -= vl; + dst_u += vl; + dst_v += vl; + src_uv += 2 * vl; + } while (w > 0); +} + +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + size_t 
w = (size_t)width; + do { + vuint8m4_t v_u, v_v; + size_t vl = __riscv_vsetvl_e8m4(w); + v_u = __riscv_vle8_v_u8m4(src_u, vl); + v_v = __riscv_vle8_v_u8m4(src_v, vl); + __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl); + w -= vl; + src_u += vl; + src_v += vl; + dst_uv += 2 * vl; + } while (w > 0); +} + struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_win.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_win.cc deleted file mode 100644 index b414d2d0e..000000000 --- a/libfenrir/src/main/jni/animation/libyuv/source/row_win.cc +++ /dev/null @@ -1,6440 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -// This module is for Visual C 32/64 bit -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) - -#if defined(_M_ARM64EC) -#include -#elif defined(_M_X64) -#include -#include // For _mm_maddubs_epi16 -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// 64 bit -#if defined(_M_X64) - -// Read 8 UV from 444 -#define READYUV444 \ - xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ - xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - u_buf += 8; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; - -// Read 8 UV from 444, With 8 Alpha. 
-#define READYUVA444 \ - xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ - xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - u_buf += 8; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; - -// Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; - -// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; - -// Convert 8 pixels: 8 UV and 8 Y. 
-#define YUVTORGB(yuvconstants) \ - xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80)); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \ - xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \ - xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \ - xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \ - xmm0 = _mm_adds_epi16(xmm4, xmm0); \ - xmm1 = _mm_subs_epi16(xmm4, xmm1); \ - xmm2 = _mm_adds_epi16(xmm4, xmm2); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ - xmm2 = _mm_packus_epi16(xmm2, xmm2); - -// Store 8 ARGB values. -#define STOREARGB \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ - _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ - _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ - dst_argb += 32; - -#if defined(HAS_I422TOARGBROW_SSSE3) -void I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUV422 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) -void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - 
READYUVA422 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I444TOARGBROW_SSSE3) -void I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUV444 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) -void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUVA444 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -// 32 bit -#else // defined(_M_X64) -#ifdef HAS_ARGBTOYROW_SSSE3 - -// Constants for ARGB. -static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, - 13, 65, 33, 0, 13, 65, 33, 0}; - -// JPeg full range. -static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, - 15, 75, 38, 0, 15, 75, 38, 0}; - -static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0}; - -static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, - 127, -84, -43, 0, 127, -84, -43, 0}; - -static const vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -}; - -static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, - -20, -107, 127, 0, -20, -107, 127, 0}; - -// vpshufb for vphaddw + vpackuswb packed to shorts. -static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; - -// Constants for BGRA. 
-static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, - 0, 33, 65, 13, 0, 33, 65, 13}; - -static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, - 0, -38, -74, 112, 0, -38, -74, 112}; - -static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, - 0, 112, -94, -18, 0, 112, -94, -18}; - -// Constants for ABGR. -static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, - 33, 65, 13, 0, 33, 65, 13, 0}; - -static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, - -38, -74, 112, 0, -38, -74, 112, 0}; - -static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, - 112, -94, -18, 0, 112, -94, -18, 0}; - -// Constants for RGBA. -static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, - 0, 13, 65, 33, 0, 13, 65, 33}; - -static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, - 0, 112, -74, -38, 0, 112, -74, -38}; - -static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, - 0, -18, -94, 112, 0, -18, -94, 112}; - -static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; - -// 7 bit fixed point 0.5. -static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; - -// 8 bit fixed point 0.5, for bias of UV. -static const ulvec8 kBiasUV128 = { - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Shuffle table for converting RGB24 to ARGB. -static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; - -// Shuffle table for converting RAW to ARGB. -static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, - 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; - -// Shuffle table for converting RAW to RGB24. First 8. 
-static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Middle 8. -static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Last 8. -static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RGB24. -static const uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RAW. -static const uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -static const uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; - -// YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, - 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, - 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - -// YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, - 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, - 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; - -// UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, - 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, - 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; - -// UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, - 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, - 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; - -// NV21 shuf 8 VU to 16 UV. 
-static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, -}; - -// Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - - convertloop: - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 - punpckhwd xmm1, xmm1 - por xmm0, xmm5 - por xmm1, xmm5 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} - -#ifdef HAS_J400TOARGBROW_AVX2 -// Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 - vpslld ymm5, ymm5, 24 - - convertloop: - vmovdqu xmm0, [eax] - lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 - vpunpcklbw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - vpunpckhwd ymm1, ymm0, ymm0 - vpunpcklwd ymm0, ymm0, ymm0 - vpor ymm0, ymm0, ymm5 - vpor ymm1, ymm1, ymm5 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_J400TOARGBROW_AVX2 - -__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_rgb24 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm3, [eax + 32] - lea 
eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm4 - por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm4 - movdqu [edx + 32], xmm2 - por xmm0, xmm5 - pshufb xmm1, xmm4 - movdqu [edx], xmm0 - por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm4 - movdqu [edx + 16], xmm1 - por xmm3, xmm5 - movdqu [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm4 - por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm4 - movdqu [edx + 32], xmm2 - por xmm0, xmm5 - pshufb xmm1, xmm4 - movdqu [edx], xmm0 - por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm4 - movdqu [edx + 16], xmm1 - por xmm3, xmm5 - movdqu [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, - uint8_t* dst_rgb24, - int width) { - __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_rgb24 - mov ecx, [esp + 12] // width - movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 - movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 - movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 4] - movdqu xmm2, [eax + 8] - lea eax, [eax + 24] - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq 
qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 8 - jg convertloop - ret - } -} - -// pmul method to replicate bits. -// Math to replicate bits: -// (v << 8) | (v << 3) -// v * 256 + v * 8 -// v * (256 + 8) -// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -// 20 instructions. -__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - movd xmm5, eax - pshufd xmm5, xmm5, 0 - mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - movd xmm6, eax - pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red - psllw xmm3, 11 - pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green - psllw xmm4, 10 - psrlw xmm4, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha - psllw xmm7, 8 - - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgr565 - movdqa xmm1, xmm0 - movdqa xmm2, xmm0 - pand xmm1, xmm3 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pmulhuw xmm1, xmm5 // * (256 + 8) - pmulhuw xmm2, xmm5 // * (256 + 8) - psllw xmm1, 8 - por xmm1, xmm2 // RB - pand xmm0, xmm4 // G in middle 6 bits - pmulhuw xmm0, xmm6 // << 5 * (256 + 4) - por xmm0, xmm7 // AG - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -#ifdef HAS_RGB565TOARGBROW_AVX2 -// pmul method to replicate bits. 
-// Math to replicate bits: -// (v << 8) | (v << 3) -// v * 256 + v * 8 -// v * (256 + 8) -// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - vmovd xmm5, eax - vbroadcastss ymm5, xmm5 - mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red - vpsllw ymm3, ymm3, 11 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green - vpsllw ymm4, ymm4, 10 - vpsrlw ymm4, ymm4, 5 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha - vpsllw ymm7, ymm7, 8 - - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 - vpand ymm1, ymm0, ymm3 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpand ymm0, ymm0, ymm4 // G in middle 6 bits - vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) - vpor ymm0, ymm0, ymm7 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm1, ymm1, 0xd8 - vpunpckhbw ymm2, ymm1, ymm0 - vpunpcklbw ymm1, ymm1, ymm0 - vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_RGB565TOARGBROW_AVX2 - -#ifdef HAS_ARGB1555TOARGBROW_AVX2 -__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - vmovd xmm5, eax - vbroadcastss ymm5, xmm5 - mov eax, 0x42004200 // 
multiplier shift by 6 and then repeat 5 bits - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red - vpsllw ymm3, ymm3, 11 - vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha - vpsllw ymm7, ymm7, 8 - - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 - vpsllw ymm1, ymm0, 1 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpand ymm1, ymm1, ymm3 - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpsraw ymm2, ymm0, 8 // A - vpand ymm0, ymm0, ymm4 // G in middle 5 bits - vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) - vpand ymm2, ymm2, ymm7 - vpor ymm0, ymm0, ymm2 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm1, ymm1, 0xd8 - vpunpckhbw ymm2, ymm1, ymm0 - vpunpcklbw ymm1, ymm1, ymm0 - vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGB1555TOARGBROW_AVX2 - -#ifdef HAS_ARGB4444TOARGBROW_AVX2 -__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f - vmovd xmm4, eax - vbroadcastss ymm4, xmm4 - vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 - vpand ymm2, ymm0, ymm5 // mask high nibbles - vpand ymm0, ymm0, ymm4 // mask low nibbles - vpsrlw ymm3, ymm2, 4 - vpsllw ymm1, ymm0, 4 - vpor ymm2, ymm2, 
ymm3 - vpor ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm2, ymm2, 0xd8 - vpunpckhbw ymm1, ymm0, ymm2 - vpunpcklbw ymm0, ymm0, ymm2 - vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGB4444TOARGBROW_AVX2 - -// 24 instructions -__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - movd xmm5, eax - pshufd xmm5, xmm5, 0 - mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits - movd xmm6, eax - pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red - psllw xmm3, 11 - movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green - psrlw xmm4, 6 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha - psllw xmm7, 8 - - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of 1555 - movdqa xmm1, xmm0 - movdqa xmm2, xmm0 - psllw xmm1, 1 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pand xmm1, xmm3 - pmulhuw xmm2, xmm5 // * (256 + 8) - pmulhuw xmm1, xmm5 // * (256 + 8) - psllw xmm1, 8 - por xmm1, xmm2 // RB - movdqa xmm2, xmm0 - pand xmm0, xmm4 // G in middle 5 bits - psraw xmm2, 8 // A - pmulhuw xmm0, xmm6 // << 6 * (256 + 8) - pand xmm2, xmm7 - por xmm0, xmm2 // AG - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -// 18 instructions. 
-__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f - movd xmm4, eax - pshufd xmm4, xmm4, 0 - movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles - pslld xmm5, 4 - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 - movdqa xmm2, xmm0 - pand xmm0, xmm4 // mask low nibbles - pand xmm2, xmm5 // mask high nibbles - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - psllw xmm1, 4 - psrlw xmm3, 4 - por xmm0, xmm1 - por xmm2, xmm3 - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 - - convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB - pshufb xmm1, xmm6 - pshufb xmm2, xmm6 - pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 
2 - lea edx, [edx + 48] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW - - convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB - pshufb xmm1, xmm6 - pshufb xmm2, xmm6 - pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 - lea edx, [edx + 48] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f - psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 - psrld xmm4, 26 - pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 - pslld xmm5, 11 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr 
[edx], xmm0 // store 4 pixels of RGB565 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - __asm { - - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - movd xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - punpcklbw xmm6, xmm6 // make dither 16 bytes - movdqa xmm7, xmm6 - punpcklwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f - psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 - psrld xmm4, 26 - pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 - pslld xmm5, 11 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - paddusb xmm0, xmm6 // add dither - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - vbroadcastss xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes - vpermq ymm6, ymm6, 0xd8 - vpunpcklwd ymm6, ymm6, ymm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f - vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 - vpsrld ymm4, ymm4, 26 - vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpaddusb ymm0, ymm0, ymm6 // add 
dither - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR - vpackusdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTORGB565DITHERROW_AVX2 - -// TODO(fbarchard): Improve sign extension/packing. -__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0x0000001f - psrld xmm4, 27 - movdqa xmm5, xmm4 // generate mask 0x000003e0 - pslld xmm5, 5 - movdqa xmm6, xmm4 // generate mask 0x00007c00 - pslld xmm6, 10 - pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 - pslld xmm7, 15 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - movdqa xmm3, xmm0 // R - psrad xmm0, 16 // A - psrld xmm1, 3 // B - psrld xmm2, 6 // G - psrld xmm3, 9 // R - pand xmm0, xmm7 // A - pand xmm1, xmm4 // B - pand xmm2, xmm5 // G - pand xmm3, xmm6 // R - por xmm0, xmm1 // BA - por xmm2, xmm3 // GR - por xmm0, xmm2 // BGRA - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 - psllw xmm4, 12 - movdqa xmm3, xmm4 // generate mask 0x00f000f0 - psrlw xmm3, 8 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, 
xmm0 - pand xmm0, xmm3 // low nibble - pand xmm1, xmm4 // high nibble - psrld xmm0, 4 - psrld xmm1, 8 - por xmm0, xmm1 - packuswb xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f - vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 - vpsrld ymm4, ymm4, 26 - vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR - vpackusdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTORGB565ROW_AVX2 - -#ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 - vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f - vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 - vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 - vpslld ymm7, ymm7, 15 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm3, ymm0, 9 // R - vpsrld ymm2, ymm0, 6 // G - vpsrld ymm1, ymm0, 3 // B - vpsrad ymm0, ymm0, 16 // A 
- vpand ymm3, ymm3, ymm6 // R - vpand ymm2, ymm2, ymm5 // G - vpand ymm1, ymm1, ymm4 // B - vpand ymm0, ymm0, ymm7 // A - vpor ymm0, ymm0, ymm1 // BA - vpor ymm2, ymm2, ymm3 // GR - vpor ymm0, ymm0, ymm2 // BGRA - vpackssdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOARGB1555ROW_AVX2 - -#ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 - vpsllw ymm4, ymm4, 12 - vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpand ymm1, ymm0, ymm4 // high nibble - vpand ymm0, ymm0, ymm3 // low nibble - vpsrld ymm1, ymm1, 8 - vpsrld ymm0, ymm0, 4 - vpor ymm0, ymm0, ymm1 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOARGB4444ROW_AVX2 - -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
-__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToYJ - movdqa xmm5, xmmword ptr kAddYJ64 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - paddw xmm0, xmm5 // Add .5 for rounding. - paddw xmm2, xmm5 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTOYROW_AVX2 -// vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; - -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
-__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - vbroadcastf128 ymm4, xmmword ptr kARGBToY - vbroadcastf128 ymm5, xmmword ptr kAddY16 - vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpmaddubsw ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - lea eax, [eax + 128] - vphaddw ymm0, ymm0, ymm1 // mutates. - vphaddw ymm2, ymm2, ymm3 - vpsrlw ymm0, ymm0, 7 - vpsrlw ymm2, ymm2, 7 - vpackuswb ymm0, ymm0, ymm2 // mutates. - vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. - vpaddb ymm0, ymm0, ymm5 // add 16 for Y - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOYROW_AVX2 - -#ifdef HAS_ARGBTOYJROW_AVX2 -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - vbroadcastf128 ymm4, xmmword ptr kARGBToYJ - vbroadcastf128 ymm5, xmmword ptr kAddYJ64 - vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpmaddubsw ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - lea eax, [eax + 128] - vphaddw ymm0, ymm0, ymm1 // mutates. - vphaddw ymm2, ymm2, ymm3 - vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. - vpaddw ymm2, ymm2, ymm5 - vpsrlw ymm0, ymm0, 7 - vpsrlw ymm2, ymm2, 7 - vpackuswb ymm0, ymm0, ymm2 // mutates. - vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 
- vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBTOYJROW_AVX2 - -__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kBGRAToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kABGRToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kRGBAToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - 
pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToV - movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} 
- -__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToVJ - movdqa xmm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - paddw xmm0, xmm5 // +.5 rounding -> unsigned - paddw xmm1, xmm5 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, 
[esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kBiasUV128 - vbroadcastf128 ymm6, xmmword ptr kARGBToV - vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - vpavgb ymm2, ymm2, [eax + esi + 64] - vpavgb ymm3, ymm3, [eax + esi + 96] - lea eax, [eax + 128] - vshufps ymm4, ymm0, ymm1, 0x88 - vshufps ymm0, ymm0, ymm1, 0xdd - vpavgb ymm0, ymm0, ymm4 // mutated by vshufps - vshufps ymm4, ymm2, ymm3, 0x88 - vshufps ymm2, ymm2, ymm3, 0xdd - vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V - vpmaddubsw ymm1, ymm0, ymm7 // U - vpmaddubsw ymm3, ymm2, ymm7 - vpmaddubsw ymm0, ymm0, ymm6 // V - vpmaddubsw ymm2, ymm2, ymm6 - vphaddw ymm1, ymm1, ymm3 // mutates - vphaddw ymm0, ymm0, ymm2 - vpsraw ymm1, ymm1, 8 - vpsraw ymm0, ymm0, 8 - vpacksswb ymm0, ymm1, ymm0 // mutates - vpermq ymm0, ymm0, 0xd8 // For vpacksswb - vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - vpaddb ymm0, ymm0, ymm5 // -> unsigned - - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBTOUVROW_AVX2 - -#ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // 
src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kBiasUV128 - vbroadcastf128 ymm6, xmmword ptr kARGBToVJ - vbroadcastf128 ymm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - vpavgb ymm2, ymm2, [eax + esi + 64] - vpavgb ymm3, ymm3, [eax + esi + 96] - lea eax, [eax + 128] - vshufps ymm4, ymm0, ymm1, 0x88 - vshufps ymm0, ymm0, ymm1, 0xdd - vpavgb ymm0, ymm0, ymm4 // mutated by vshufps - vshufps ymm4, ymm2, ymm3, 0x88 - vshufps ymm2, ymm2, ymm3, 0xdd - vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V - vpmaddubsw ymm1, ymm0, ymm7 // U - vpmaddubsw ymm3, ymm2, ymm7 - vpmaddubsw ymm0, ymm0, ymm6 // V - vpmaddubsw ymm2, ymm2, ymm6 - vphaddw ymm1, ymm1, ymm3 // mutates - vphaddw ymm0, ymm0, ymm2 - vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned - vpaddw ymm0, ymm0, ymm5 - vpsraw ymm1, ymm1, 8 - vpsraw ymm0, ymm0, 8 - vpacksswb ymm0, ymm1, ymm0 // mutates - vpermq ymm0, ymm0, 0xd8 // For vpacksswb - vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBTOUVJROW_AVX2 - -__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // 
dst_v - mov ecx, [esp + 4 + 16] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToV - movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* convert to U and V */ - movdqu xmm0, [eax] // U - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - - movdqu xmm0, [eax] // V - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm6 - pmaddubsw xmm1, xmm6 - pmaddubsw xmm2, xmm6 - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - lea eax, [eax + 64] - movdqu [edx + edi], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kBGRAToV - movdqa xmm7, xmmword ptr kBGRAToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa 
xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kABGRToV - movdqa xmm7, xmmword ptr kABGRToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw 
xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kRGBAToV - movdqa xmm7, xmmword ptr kRGBAToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 
// U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBTOYROW_SSSE3 - -// Read 16 UV from 444 -#define READYUV444_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 16 UV from 444. With 16 Alpha. -#define READYUVA444_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ - __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16]} - -// Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 \ - __asm { \ - __asm vmovq xmm3, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
-#define READYUVA422_AVX2 \ - __asm { \ - __asm vmovq xmm3, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ - __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16]} - -// Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* UV */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 UV from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* UV */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* YUY2 */ \ - __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ - __asm vmovdqu ymm3, [eax] /* UV */ \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 32]} - -// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* UYVY */ \ - __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ - __asm vmovdqu ymm3, [eax] /* UV */ \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 32]} - -// Convert 16 pixels: 16 UV and 16 Y. 
-#define YUVTORGB_AVX2(YuvConstants) \ - __asm { \ - __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \ - __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ - __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \ - __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \ - __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \ - __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \ - __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \ - __asm vpmaddubsw ymm2, ymm2, ymm3 /* B UV */ \ - __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \ - __asm vpaddw ymm4, ymm3, ymm4 \ - __asm vpaddsw ymm0, ymm0, ymm4 \ - __asm vpsubsw ymm1, ymm4, ymm1 \ - __asm vpaddsw ymm2, ymm2, ymm4 \ - __asm vpsraw ymm0, ymm0, 6 \ - __asm vpsraw ymm1, ymm1, 6 \ - __asm vpsraw ymm2, ymm2, 6 \ - __asm vpackuswb ymm0, ymm0, ymm0 \ - __asm vpackuswb ymm1, ymm1, ymm1 \ - __asm vpackuswb ymm2, ymm2, ymm2} - -// Store 16 ARGB values. -#define STOREARGB_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ - __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ - __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ - __asm vmovdqu 0[edx], ymm1 \ - __asm vmovdqu 32[edx], ymm0 \ - __asm lea edx, [edx + 64]} - -// Store 16 RGBA values. -#define STORERGBA_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ - __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ - __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ - __asm vmovdqu [edx], ymm0 \ - __asm vmovdqu [edx + 32], ymm1 \ - __asm lea edx, [edx + 64]} - -#ifdef HAS_I422TOARGBROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
-__declspec(naked) void I422ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422TOARGBROW_AVX2 - -#ifdef HAS_I422ALPHATOARGBROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -__declspec(naked) void I422AlphaToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA422_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422ALPHATOARGBROW_AVX2 - -#ifdef HAS_I444TOARGBROW_AVX2 -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
-__declspec(naked) void I444ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - convertloop: - READYUV444_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I444TOARGBROW_AVX2 - -#ifdef HAS_I444ALPHATOARGBROW_AVX2 -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void I444AlphaToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - convertloop: - READYUVA444_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I444AlphaTOARGBROW_AVX2 - -#ifdef HAS_NV12TOARGBROW_AVX2 -// 16 pixels. -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
-__declspec(naked) void NV12ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READNV12_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop esi - vzeroupper - ret - } -} -#endif // HAS_NV12TOARGBROW_AVX2 - -#ifdef HAS_NV21TOARGBROW_AVX2 -// 16 pixels. -// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void NV21ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READNV21_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop esi - vzeroupper - ret - } -} -#endif // HAS_NV21TOARGBROW_AVX2 - -#ifdef HAS_YUY2TOARGBROW_AVX2 -// 16 pixels. -// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 
-__declspec(naked) void YUY2ToARGBRow_AVX2( - const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUY2_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - vzeroupper - ret - } -} -#endif // HAS_YUY2TOARGBROW_AVX2 - -#ifdef HAS_UYVYTOARGBROW_AVX2 -// 16 pixels. -// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) void UYVYToARGBRow_AVX2( - const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READUYVY_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - vzeroupper - ret - } -} -#endif // HAS_UYVYTOARGBROW_AVX2 - -#ifdef HAS_I422TORGBAROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 
-__declspec(naked) void I422ToRGBARow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // abgr - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(ebx) - STORERGBA_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422TORGBAROW_AVX2 - -#if defined(HAS_I422TOARGBROW_SSSE3) -// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. -// Allows a conversion with half size scaling. - -// Read 8 UV from 444. -#define READYUV444 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* U */ \ - __asm movq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 UV from 444. With 8 Alpha. -#define READYUVA444 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* U */ \ - __asm movq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8]} - -// Read 4 UV from 422, upsample to 8 UV. 
-#define READYUV422 \ - __asm { \ - __asm movd xmm3, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - __asm { \ - __asm movd xmm3, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8]} - -// Read 4 UV from NV12, upsample to 8 UV. -#define READNV12 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* UV */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 VU from NV21, upsample to 8 UV. -#define READNV21 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* UV */ \ - __asm lea esi, [esi + 8] \ - __asm pshufb xmm3, xmmword ptr kShuffleNV21 \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. -#define READYUY2 \ - __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ - __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm3, [eax] /* UV */ \ - __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 16]} - -// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. 
-#define READUYVY \ - __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ - __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm3, [eax] /* UV */ \ - __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 16]} - -// Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(YuvConstants) \ - __asm { \ - __asm psubb xmm3, xmmword ptr kBiasUV128 \ - __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \ - __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \ - __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \ - __asm pmaddubsw xmm0, xmm3 \ - __asm pmaddubsw xmm1, xmm3 \ - __asm pmaddubsw xmm2, xmm3 \ - __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \ - __asm paddw xmm4, xmm3 \ - __asm paddsw xmm0, xmm4 \ - __asm paddsw xmm2, xmm4 \ - __asm psubsw xmm4, xmm1 \ - __asm movdqa xmm1, xmm4 \ - __asm psraw xmm0, 6 \ - __asm psraw xmm1, 6 \ - __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ - } - -// Store 8 ARGB values. -#define STOREARGB \ - __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ - __asm movdqu 0[edx], xmm0 \ - __asm movdqu 16[edx], xmm1 \ - __asm lea edx, [edx + 32]} - -// Store 8 BGRA values. -#define STOREBGRA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ - __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ - __asm movdqu 0[edx], xmm5 \ - __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32]} - -// Store 8 RGBA values. 
-#define STORERGBA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ - __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ - __asm movdqu 0[edx], xmm5 \ - __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32]} - -// Store 8 RGB24 values. -#define STORERGB24 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ - __asm lea edx, [edx + 24]} - -// Store 8 RGB565 values. 
-#define STORERGB565 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ - __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ - __asm lea edx, [edx + 16]} - -// 8 pixels. -// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void I444ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV444 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. 
-// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes). -__declspec(naked) void I444AlphaToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA444 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). -__declspec(naked) void I422ToRGB24Row_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGB24 - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes). 
-__declspec(naked) void I444ToRGB24Row_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 - - convertloop: - READYUV444 - YUVTORGB(ebx) - STORERGB24 - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). -__declspec(naked) void I422ToRGB565Row_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* rgb565_buf, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate mask 0x0000001f - psrld xmm5, 27 - pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 - psrld xmm6, 26 - pslld xmm6, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 - pslld xmm7, 11 - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGB565 - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
-__declspec(naked) void I422ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV422 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. -__declspec(naked) void I422AlphaToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA422 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
-__declspec(naked) void NV12ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READNV12 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void NV21ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READNV21 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop esi - ret - } -} - -// 8 pixels. -// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) void YUY2ToARGBRow_SSSE3( - const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUY2 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - ret - } -} - -// 8 pixels. -// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). 
-__declspec(naked) void UYVYToARGBRow_SSSE3( - const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READUYVY - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - ret - } -} - -__declspec(naked) void I422ToRGBARow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGBA - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} -#endif // HAS_I422TOARGBROW_SSSE3 - -// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter -#ifdef HAS_I400TOARGBROW_SSE2 -// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). -__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* rgb_buf, - const struct YuvConstants*, - int width) { - __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) - movd xmm2, eax - pshufd xmm2, xmm2,0 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) - movd xmm3, eax - pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width - - convertloop: - // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 // Y.Y - pmulhuw xmm0, xmm2 - psubusw xmm0, xmm3 - psrlw xmm0, 6 - packuswb xmm0, xmm0 // G - - // Step 2: Weave into ARGB - punpcklbw xmm0, xmm0 // GG - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 // BGRA first 4 pixels - punpckhwd xmm1, xmm1 // BGRA next 4 pixels - por xmm0, xmm4 - por xmm1, xmm4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_I400TOARGBROW_SSE2 - -#ifdef HAS_I400TOARGBROW_AVX2 -// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). -// note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, - uint8_t* rgb_buf, - const struct YuvConstants*, - int width) { - __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) - vmovd xmm2, eax - vbroadcastss ymm2, xmm2 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) - vmovd xmm3, eax - vbroadcastss ymm3, xmm3 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 - vpslld ymm4, ymm4, 24 - - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width - - convertloop: - // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 - vmovdqu xmm0, [eax] - lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates - vpunpcklbw ymm0, ymm0, ymm0 // Y.Y - vpmulhuw ymm0, ymm0, ymm2 - vpsubusw ymm0, ymm0, ymm3 - vpsrlw ymm0, ymm0, 6 - vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 - - // TODO(fbarchard): Weave alpha with unpack. 
- // Step 2: Weave into ARGB - vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates - vpermq ymm1, ymm1, 0xd8 - vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels - vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels - vpor ymm0, ymm0, ymm4 - vpor ymm1, ymm1, ymm4 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_I400TOARGBROW_AVX2 - -#ifdef HAS_MIRRORROW_SSSE3 -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -// TODO(fbarchard): Replace lea with -16 offset. -__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - movdqa xmm5, xmmword ptr kShuffleMirror - - convertloop: - movdqu xmm0, [eax - 16 + ecx] - pshufb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} -#endif // HAS_MIRRORROW_SSSE3 - -#ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vbroadcastf128 ymm5, xmmword ptr kShuffleMirror - - convertloop: - vmovdqu ymm0, [eax - 32 + ecx] - vpshufb ymm0, ymm0, ymm5 - vpermq ymm0, ymm0, 0x4e // swap high and low halfs - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_MIRRORROW_AVX2 - -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 -// Shuffle table for reversing the bytes of UV channels. 
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; - -__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - movdqa xmm1, xmmword ptr kShuffleMirrorUV - lea eax, [eax + ecx * 2 - 16] - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - lea eax, [eax - 16] - pshufb xmm0, xmm1 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [edx + edi], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_MIRRORSPLITUVROW_SSSE3 - -#ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - lea eax, [eax - 16 + ecx * 4] // last 4 pixels. - - convertloop: - movdqu xmm0, [eax] - lea eax, [eax - 16] - pshufd xmm0, xmm0, 0x1b - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - ret - } -} -#endif // HAS_ARGBMIRRORROW_SSE2 - -#ifdef HAS_ARGBMIRRORROW_AVX2 -// Shuffle table for reversing the bytes. 
-static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 - - convertloop: - vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBMIRRORROW_AVX2 - -#ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - pand xmm0, xmm5 // even bytes - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm2, 8 // odd bytes - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqu [edx], xmm0 - movdqu [edx + edi], xmm2 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -#endif // HAS_SPLITUVROW_SSE2 - -#ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm2, ymm0, 8 // odd bytes - vpsrlw ymm3, ymm1, 8 - vpand ymm0, ymm0, ymm5 // even bytes - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - 
vpackuswb ymm2, ymm2, ymm3 - vpermq ymm0, ymm0, 0xd8 - vpermq ymm2, ymm2, 0xd8 - vmovdqu [edx], ymm0 - vmovdqu [edx + edi], ymm2 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_SPLITUVROW_AVX2 - -#ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // read 16 U's - movdqu xmm1, [eax + edx] // and 16 V's - lea eax, [eax + 16] - movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs - movdqu [edi], xmm0 - movdqu [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_MERGEUVROW_SSE2 - -#ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - convertloop: - vpmovzxbw ymm0, [eax] - vpmovzxbw ymm1, [eax + edx] - lea eax, [eax + 16] - vpsllw ymm1, ymm1, 8 - vpor ymm2, ymm1, ymm0 - vmovdqu [edi], ymm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_MERGEUVROW_AVX2 - -#ifdef HAS_COPYROW_SSE2 -// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time. 
-__declspec(naked) void CopyRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - test eax, 15 - jne convertloopu - test edx, 15 - jne convertloopu - - convertloopa: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloopa - ret - - convertloopu: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloopu - ret - } -} -#endif // HAS_COPYROW_SSE2 - -#ifdef HAS_COPYROW_AVX -// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time. -__declspec(naked) void CopyRow_AVX(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 64 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_COPYROW_AVX - -// Multiple of 1. 
-__declspec(naked) void CopyRow_ERMS(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, esi - mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // width - rep movsb - mov edi, edx - mov esi, eax - ret - } -} - -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 - pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff - psrld xmm1, 8 - - convertloop: - movdqu xmm2, [eax] - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - movdqu xmm4, [edx] - movdqu xmm5, [edx + 16] - pand xmm2, xmm0 - pand xmm3, xmm0 - pand xmm4, xmm1 - pand xmm5, xmm1 - por xmm2, xmm4 - por xmm3, xmm5 - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - ret - } -} -#endif // HAS_ARGBCOPYALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff - - convertloop: - vmovdqu ymm1, [eax] - vmovdqu ymm2, [eax + 32] - lea eax, [eax + 64] - vpblendvb ymm1, ymm1, [edx], ymm0 - vpblendvb ymm2, ymm2, [edx + 32], ymm0 - vmovdqu [edx], ymm1 - vmovdqu [edx + 32], ymm2 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBCOPYALPHAROW_AVX2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a - mov ecx, [esp + 12] // width - - extractloop: - movdqu 
xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrld xmm0, 24 - psrld xmm1, 24 - packssdw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg extractloop - - ret - } -} -#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a - mov ecx, [esp + 12] // width - vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX - - extractloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpsrld ymm0, ymm0, 24 - vpsrld ymm1, ymm1, 24 - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - lea eax, [eax + 128] - vpackssdw ymm0, ymm0, ymm1 // mutates - vpsrld ymm2, ymm2, 24 - vpsrld ymm3, ymm3, 24 - vpackssdw ymm2, ymm2, ymm3 // mutates - vpackuswb ymm0, ymm0, ymm2 // mutates - vpermd ymm0, ymm4, ymm0 // unmutate - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg extractloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 - pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff - psrld xmm1, 8 - - convertloop: - movq xmm2, qword ptr [eax] // 8 Y's - lea eax, [eax + 8] - punpcklbw xmm2, xmm2 - punpckhwd xmm3, xmm2 - punpcklwd xmm2, xmm2 - movdqu xmm4, [edx] - movdqu xmm5, [edx + 16] - pand xmm2, xmm0 - pand xmm3, xmm0 - pand xmm4, xmm1 - pand xmm5, xmm1 - por xmm2, xmm4 - por xmm3, xmm5 - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - ret - } -} -#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 - -#ifdef 
HAS_ARGBCOPYYTOALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff - - convertloop: - vpmovzxbd ymm1, qword ptr [eax] - vpmovzxbd ymm2, qword ptr [eax + 8] - lea eax, [eax + 16] - vpslld ymm1, ymm1, 24 - vpslld ymm2, ymm2, 24 - vpblendvb ymm1, ymm1, [edx], ymm0 - vpblendvb ymm2, ymm2, [edx + 32], ymm0 - vmovdqu [edx], ymm1 - vmovdqu [edx + 32], ymm2 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 - -#ifdef HAS_SETROW_X86 -// Write 'width' bytes using an 8 bit value repeated. -// width should be multiple of 4. -__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { - __asm { - movzx eax, byte ptr [esp + 8] // v8 - mov edx, 0x01010101 // Duplicate byte to all bytes. - mul edx // overwrites edx with upper part of result. - mov edx, edi - mov edi, [esp + 4] // dst - mov ecx, [esp + 12] // width - shr ecx, 2 - rep stosd - mov edi, edx - ret - } -} - -// Write 'width' bytes using an 8 bit value repeated. -__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { - __asm { - mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v8 - mov ecx, [esp + 12] // width - rep stosb - mov edi, edx - ret - } -} - -// Write 'width' 32 bit values. 
-__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, - uint32_t v32, - int width) { - __asm { - mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 - mov ecx, [esp + 12] // width - rep stosd - mov edi, edx - ret - } -} -#endif // HAS_SETROW_X86 - -#ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // even bytes are Y - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} - -__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. 
- vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} - -__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. - vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // odd bytes are Y - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. 
- vpermq ymm0, ymm0, 0xd8 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. - vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. 
- vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_YUY2TOYROW_AVX2 - -#ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov 
ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* 
dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_YUY2TOYROW_SSE2 - -#ifdef HAS_BLENDPLANEROW_SSSE3 -// Blend 8 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - __asm { - push esi - push edi - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - mov eax, 0x80808080 // 128 for biasing image to signed. - movd xmm6, eax - pshufd xmm6, xmm6, 0x00 - - mov eax, 0x807f807f // 32768 + 127 for unbias and round. - movd xmm7, eax - pshufd xmm7, xmm7, 0x00 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 - mov esi, [esp + 8 + 12] // alpha - mov edi, [esp + 8 + 16] // dst - mov ecx, [esp + 8 + 20] // width - sub eax, esi - sub edx, esi - sub edi, esi - - // 8 pixel loop. - convertloop8: - movq xmm0, qword ptr [esi] // alpha - punpcklbw xmm0, xmm0 - pxor xmm0, xmm5 // a, 255-a - movq xmm1, qword ptr [eax + esi] // src0 - movq xmm2, qword ptr [edx + esi] // src1 - punpcklbw xmm1, xmm2 - psubb xmm1, xmm6 // bias src0/1 - 128 - pmaddubsw xmm0, xmm1 - paddw xmm0, xmm7 // unbias result - 32768 and round. 
- psrlw xmm0, 8 - packuswb xmm0, xmm0 - movq qword ptr [edi + esi], xmm0 - lea esi, [esi + 8] - sub ecx, 8 - jg convertloop8 - - pop edi - pop esi - ret - } -} -#endif // HAS_BLENDPLANEROW_SSSE3 - -#ifdef HAS_BLENDPLANEROW_AVX2 -// Blend 32 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - __asm { - push esi - push edi - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 - vpsllw ymm5, ymm5, 8 - mov eax, 0x80808080 // 128 for biasing image to signed. - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - mov eax, 0x807f807f // 32768 + 127 for unbias and round. - vmovd xmm7, eax - vbroadcastss ymm7, xmm7 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 - mov esi, [esp + 8 + 12] // alpha - mov edi, [esp + 8 + 16] // dst - mov ecx, [esp + 8 + 20] // width - sub eax, esi - sub edx, esi - sub edi, esi - - // 32 pixel loop. - convertloop32: - vmovdqu ymm0, [esi] // alpha - vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 - vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 - vpxor ymm3, ymm3, ymm5 // a, 255-a - vpxor ymm0, ymm0, ymm5 // a, 255-a - vmovdqu ymm1, [eax + esi] // src0 - vmovdqu ymm2, [edx + esi] // src1 - vpunpckhbw ymm4, ymm1, ymm2 - vpunpcklbw ymm1, ymm1, ymm2 - vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 - vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 - vpmaddubsw ymm3, ymm3, ymm4 - vpmaddubsw ymm0, ymm0, ymm1 - vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. - vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. 
- vpsrlw ymm3, ymm3, 8 - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm3 - vmovdqu [edi + esi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg convertloop32 - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_BLENDPLANEROW_AVX2 - -#ifdef HAS_ARGBBLENDROW_SSSE3 -// Shuffle table for isolating alpha. -static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; - -// Blend 8 pixels at a time. -__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 0x0001 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - sub ecx, 4 - jl convertloop4b // less than 4 pixels? - - // 4 pixel loop. - convertloop4: - movdqu xmm3, [eax] // src argb - lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g - lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jge convertloop4 - - convertloop4b: - add ecx, 4 - 1 - jl convertloop1b - - // 1 pixel loop. 
- convertloop1: - movd xmm3, [eax] // src argb - lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g - lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge convertloop1 - - convertloop1b: - pop esi - ret - } -} -#endif // HAS_ARGBBLENDROW_SSSE3 - -#ifdef HAS_ARGBATTENUATEROW_SSSE3 -// Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, -}; -static const uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, -}; -__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0xff000000 - pslld xmm3, 24 - movdqa xmm4, xmmword ptr kShuffleAlpha0 - movdqa xmm5, xmmword ptr kShuffleAlpha1 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - pshufb xmm0, xmm4 // isolate first 2 alphas - movdqu xmm1, [eax] // read 4 pixels - punpcklbw xmm1, xmm1 // first 2 pixel rgbs - pmulhuw xmm0, xmm1 // rgb * a - movdqu xmm1, [eax] // read 4 pixels - pshufb xmm1, xmm5 // isolate next 2 alphas - movdqu xmm2, [eax] // read 4 pixels - punpckhbw xmm2, xmm2 // next 2 pixel rgbs - pmulhuw xmm1, xmm2 // rgb * a - movdqu xmm2, [eax] // mask original alpha - lea eax, [eax + 16] - pand xmm2, xmm3 - psrlw xmm0, 8 - psrlw xmm1, 8 - packuswb xmm0, xmm1 
- por xmm0, xmm2 // copy original alpha - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - ret - } -} -#endif // HAS_ARGBATTENUATEROW_SSSE3 - -#ifdef HAS_ARGBATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, - 128u, 128u, 14u, 15u, 14u, 15u, - 14u, 15u, 128u, 128u}; -__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 - vpslld ymm5, ymm5, 24 - - convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpshufb ymm2, ymm0, ymm4 // low 4 alphas - vpshufb ymm3, ymm1, ymm4 // high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * a - vpmulhuw ymm1, ymm1, ymm3 // rgb * a - vpand ymm6, ymm6, ymm5 // isolate alpha - vpsrlw ymm0, ymm0, 8 - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutated. - vpor ymm0, ymm0, ymm6 // copy original alpha - vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBATTENUATEROW_AVX2 - -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -// Unattenuate 4 pixels at a time. 
-__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb - mov ecx, [esp + 12 + 12] // width - lea ebx, fixed_invtbl8 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - movzx esi, byte ptr [eax + 3] // first alpha - movzx edi, byte ptr [eax + 7] // second alpha - punpcklbw xmm0, xmm0 // first 2 - movd xmm2, dword ptr [ebx + esi * 4] - movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words - movlhps xmm2, xmm3 - pmulhuw xmm0, xmm2 // rgb * a - - movdqu xmm1, [eax] // read 4 pixels - movzx esi, byte ptr [eax + 11] // third alpha - movzx edi, byte ptr [eax + 15] // forth alpha - punpckhbw xmm1, xmm1 // next 2 - movd xmm2, dword ptr [ebx + esi * 4] - movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words - movlhps xmm2, xmm3 - pmulhuw xmm1, xmm2 // rgb * a - lea eax, [eax + 16] - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop edi - pop esi - pop ebx - ret - } -} -#endif // HAS_ARGBUNATTENUATEROW_SSE2 - -#ifdef HAS_ARGBUNATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; -// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. -// USE_GATHER is not on by default, due to being a slow instruction. 
-#ifdef USE_GATHER -__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 - - convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. - vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a - vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a - vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. - vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a - vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * ia - vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. 
- vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - vzeroupper - ret - } -} -#else // USE_GATHER -__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb - mov ecx, [esp + 12 + 12] // width - sub edx, eax - lea ebx, fixed_invtbl8 - vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 - - convertloop: - // replace VPGATHER - movzx esi, byte ptr [eax + 3] // alpha0 - movzx edi, byte ptr [eax + 7] // alpha1 - vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] - vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] - movzx esi, byte ptr [eax + 11] // alpha2 - movzx edi, byte ptr [eax + 15] // alpha3 - vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] - vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] - vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] - movzx esi, byte ptr [eax + 19] // alpha4 - movzx edi, byte ptr [eax + 23] // alpha5 - vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] - vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] - vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] - movzx esi, byte ptr [eax + 27] // alpha6 - movzx edi, byte ptr [eax + 31] // alpha7 - vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] - vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] - vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] - vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] - vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] - vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] - vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] - // end of VPGATHER - - vmovdqu ymm6, [eax] // read 8 pixels. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a - vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. 
- vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a - vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * ia - vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. - vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - pop ebx - vzeroupper - ret - } -} -#endif // USE_GATHER -#endif // HAS_ARGBATTENUATEROW_AVX2 - -#ifdef HAS_ARGBGRAYROW_SSSE3 -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. -__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToYJ - movdqa xmm5, xmmword ptr kAddYJ64 - - convertloop: - movdqu xmm0, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - phaddw xmm0, xmm1 - paddw xmm0, xmm5 // Add .5 for rounding. - psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 G bytes - movdqu xmm2, [eax] // A - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - psrld xmm2, 24 - psrld xmm3, 24 - packuswb xmm2, xmm3 - packuswb xmm2, xmm2 // 8 A bytes - movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA - punpcklbw xmm0, xmm0 // 8 GG words - punpcklbw xmm3, xmm2 // 8 GA words - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm3 // GGGA first 4 - punpckhwd xmm1, xmm3 // GGGA next 4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBGRAYROW_SSSE3 - -#ifdef HAS_ARGBSEPIAROW_SSSE3 -// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 -// Constant for ARGB color to sepia tone. 
-static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, - 17, 68, 35, 0, 17, 68, 35, 0}; - -static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, - 22, 88, 45, 0, 22, 88, 45, 0}; - -static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, - 24, 98, 50, 0, 24, 98, 50, 0}; - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - __asm { - mov eax, [esp + 4] /* dst_argb */ - mov ecx, [esp + 8] /* width */ - movdqa xmm2, xmmword ptr kARGBToSepiaB - movdqa xmm3, xmmword ptr kARGBToSepiaG - movdqa xmm4, xmmword ptr kARGBToSepiaR - - convertloop: - movdqu xmm0, [eax] // B - movdqu xmm6, [eax + 16] - pmaddubsw xmm0, xmm2 - pmaddubsw xmm6, xmm2 - phaddw xmm0, xmm6 - psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values - movdqu xmm5, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm5, xmm3 - pmaddubsw xmm1, xmm3 - phaddw xmm5, xmm1 - psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values - movdqu xmm5, [eax] // R - movdqu xmm1, [eax + 16] - pmaddubsw xmm5, xmm4 - pmaddubsw xmm1, xmm4 - phaddw xmm5, xmm1 - psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values - movdqu xmm6, [eax] // A - movdqu xmm1, [eax + 16] - psrld xmm6, 24 - psrld xmm1, 24 - packuswb xmm6, xmm1 - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm5, xmm6 // 8 RA values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 - movdqu [eax], xmm0 - movdqu [eax + 16], xmm1 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBSEPIAROW_SSSE3 - -#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R -// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
-__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* matrix_argb */ - movdqu xmm5, [ecx] - pshufd xmm2, xmm5, 0x00 - pshufd xmm3, xmm5, 0x55 - pshufd xmm4, xmm5, 0xaa - pshufd xmm5, xmm5, 0xff - mov ecx, [esp + 16] /* width */ - - convertloop: - movdqu xmm0, [eax] // B - movdqu xmm7, [eax + 16] - pmaddubsw xmm0, xmm2 - pmaddubsw xmm7, xmm2 - movdqu xmm6, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm6, xmm3 - pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values - movdqu xmm1, [eax] // R - movdqu xmm7, [eax + 16] - pmaddubsw xmm1, xmm4 - pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R - movdqu xmm6, [eax] // A - movdqu xmm7, [eax + 16] - pmaddubsw xmm6, xmm5 - pmaddubsw xmm7, xmm5 - phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A - packuswb xmm1, xmm1 // 8 R values - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm1, xmm6 // 8 RA values - movdqa xmm6, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm1 // BGRA first 4 - punpckhwd xmm6, xmm1 // BGRA next 4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm6 - lea eax, [eax + 32] - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 - -#ifdef HAS_ARGBQUANTIZEROW_SSE2 -// Quantize 4 ARGB pixels (16 bytes). 
-__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - __asm { - mov eax, [esp + 4] /* dst_argb */ - movd xmm2, [esp + 8] /* scale */ - movd xmm3, [esp + 12] /* interval_size */ - movd xmm4, [esp + 16] /* interval_offset */ - mov ecx, [esp + 20] /* width */ - pshuflw xmm2, xmm2, 040h - pshufd xmm2, xmm2, 044h - pshuflw xmm3, xmm3, 040h - pshufd xmm3, xmm3, 044h - pshuflw xmm4, xmm4, 040h - pshufd xmm4, xmm4, 044h - pxor xmm5, xmm5 // constant 0 - pcmpeqb xmm6, xmm6 // generate mask 0xff000000 - pslld xmm6, 24 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 - movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels - pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size - movdqu xmm7, [eax] // read 4 pixels - pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 - paddw xmm1, xmm4 - packuswb xmm0, xmm1 - por xmm0, xmm7 - movdqu [eax], xmm0 - lea eax, [eax + 16] - sub ecx, 4 - jg convertloop - ret - } -} -#endif // HAS_ARGBQUANTIZEROW_SSE2 - -#ifdef HAS_ARGBSHADEROW_SSE2 -// Shade 4 pixels at a time by specified value. 
-__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - movd xmm2, [esp + 16] // value - punpcklbw xmm2, xmm2 - punpcklqdq xmm2, xmm2 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value - psrlw xmm0, 8 - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - ret - } -} -#endif // HAS_ARGBSHADEROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - pxor xmm5, xmm5 // constant 0 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb - movdqu xmm2, [esi] // read 4 pixels from src_argb1 - movdqu xmm1, xmm0 - movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 - lea eax, [eax + 16] - lea esi, [esi + 16] - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_ARGBMULTIPLYROW_SSE2 - -#ifdef HAS_ARGBADDROW_SSE2 -// Add 2 rows of ARGB pixels together, 4 pixels at a time. -// TODO(fbarchard): Port this to posix, neon and other math functions. 
-__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - sub ecx, 4 - jl convertloop49 - - convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb - lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 - lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb + src_argb1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jge convertloop4 - - convertloop49: - add ecx, 4 - 1 - jl convertloop19 - - convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb - lea eax, [eax + 4] - movd xmm1, [esi] // read 1 pixels from src_argb1 - lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb + src_argb1 - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge convertloop1 - - convertloop19: - pop esi - ret - } -} -#endif // HAS_ARGBADDROW_SSE2 - -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb - lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 - lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb - src_argb1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_ARGBSUBTRACTROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_AVX2 -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 
-__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 - - convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 - lea esi, [esi + 32] - vpunpcklbw ymm0, ymm1, ymm1 // low 4 - vpunpckhbw ymm1, ymm1, ymm1 // high 4 - vpunpcklbw ymm2, ymm3, ymm5 // low 4 - vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4 - vpackuswb ymm0, ymm0, ymm1 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBMULTIPLYROW_AVX2 - -#ifdef HAS_ARGBADDROW_AVX2 -// Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 - lea esi, [esi + 32] - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBADDROW_AVX2 - -#ifdef HAS_ARGBSUBTRACTROW_AVX2 -// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1 - lea esi, [esi + 32] - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBSUBTRACTROW_AVX2 - -#ifdef HAS_SOBELXROW_SSE2 -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 - mov edi, [esp + 8 + 12] // src_y2 - mov edx, [esp + 8 + 16] // dst_sobelx - mov ecx, [esp + 8 + 20] // width - sub esi, eax - sub edi, eax - sub edx, eax - pxor xmm5, xmm5 // constant 0 - - convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] - movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] - movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - psubw xmm2, xmm3 - paddw xmm0, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw - psubw xmm1, xmm0 - pmaxsw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [eax + edx], xmm0 - lea eax, [eax + 8] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_SOBELXROW_SSE2 - -#ifdef HAS_SOBELYROW_SSE2 -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 - mov edx, [esp + 4 + 12] // dst_sobely - mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax - pxor xmm5, xmm5 // constant 0 - - convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] - movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] - movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - psubw xmm2, xmm3 - paddw xmm0, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw - psubw xmm1, xmm0 - pmaxsw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [eax + edx], xmm0 - lea eax, [eax + 8] - sub ecx, 8 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELYROW_SSE2 - -#ifdef HAS_SOBELROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
-// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - pslld xmm5, 24 // 0xff000000 - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqa xmm2, xmm0 // GG - punpcklbw xmm2, xmm0 // First 8 - punpckhbw xmm0, xmm0 // Next 8 - movdqa xmm1, xmm2 // GGGG - punpcklwd xmm1, xmm2 // First 4 - punpckhwd xmm2, xmm2 // Next 4 - por xmm1, xmm5 // GGGA - por xmm2, xmm5 - movdqa xmm3, xmm0 // GGGG - punpcklwd xmm3, xmm0 // Next 4 - punpckhwd xmm0, xmm0 // Last 4 - por xmm3, xmm5 // GGGA - por xmm0, xmm5 - movdqu [edx], xmm1 - movdqu [edx + 16], xmm2 - movdqu [edx + 32], xmm3 - movdqu [edx + 48], xmm0 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELROW_SSE2 - -#ifdef HAS_SOBELTOPLANEROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into a plane. 
-__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELTOPLANEROW_SSE2 - -#ifdef HAS_SOBELXYROW_SSE2 -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - movdqa xmm2, xmm0 - paddusb xmm2, xmm1 // sobel = sobelx + sobely - movdqa xmm3, xmm0 // XA - punpcklbw xmm3, xmm5 - punpckhbw xmm0, xmm5 - movdqa xmm4, xmm1 // YS - punpcklbw xmm4, xmm2 - punpckhbw xmm1, xmm2 - movdqa xmm6, xmm4 // YSXA - punpcklwd xmm6, xmm3 // First 4 - punpckhwd xmm4, xmm3 // Next 4 - movdqa xmm7, xmm1 // YSXA - punpcklwd xmm7, xmm0 // Next 4 - punpckhwd xmm1, xmm0 // Last 4 - movdqu [edx], xmm6 - movdqu [edx + 16], xmm4 - movdqu [edx + 32], xmm7 - movdqu [edx + 48], xmm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELXYROW_SSE2 - -#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -// Consider float CumulativeSum. 
-// Consider calling CumulativeSum one row at time as needed. -// Consider circular CumulativeSum buffer of radius * 2 + 1 height. -// Convert cumulative sum for an area to an average for 1 pixel. -// topleft is pointer to top left of CumulativeSum buffer for area. -// botleft is pointer to bottom left of CumulativeSum buffer. -// width is offset from left to right of area in CumulativeSum buffer measured -// in number of ints. -// area is the number of pixels in the area being averaged. -// dst points to pixel to store result to. -// count is number of averaged pixels to produce. -// Does 4 pixels at a time. -// This function requires alignment on accumulation buffer pointers. -void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count) { - __asm { - mov eax, topleft // eax topleft - mov esi, botleft // esi botleft - mov edx, width - movd xmm5, area - mov edi, dst - mov ecx, count - cvtdq2ps xmm5, xmm5 - rcpss xmm4, xmm5 // 1.0f / area - pshufd xmm4, xmm4, 0 - sub ecx, 4 - jl l4b - - cmp area, 128 // 128 pixels will not overflow 15 bits. - ja l4 - - pshufd xmm5, xmm5, 0 // area - pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 - psrld xmm6, 16 - cvtdq2ps xmm6, xmm6 - addps xmm5, xmm6 // (65536.0 + area - 1) - mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area - cvtps2dq xmm5, xmm5 // 0.16 fixed point - packssdw xmm5, xmm5 // 16 bit shorts - - // 4 pixel loop small blocks. 
- s4: - // top left - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - - // - top right - psubd xmm0, [eax + edx * 4] - psubd xmm1, [eax + edx * 4 + 16] - psubd xmm2, [eax + edx * 4 + 32] - psubd xmm3, [eax + edx * 4 + 48] - lea eax, [eax + 64] - - // - bottom left - psubd xmm0, [esi] - psubd xmm1, [esi + 16] - psubd xmm2, [esi + 32] - psubd xmm3, [esi + 48] - - // + bottom right - paddd xmm0, [esi + edx * 4] - paddd xmm1, [esi + edx * 4 + 16] - paddd xmm2, [esi + edx * 4 + 32] - paddd xmm3, [esi + edx * 4 + 48] - lea esi, [esi + 64] - - packssdw xmm0, xmm1 // pack 4 pixels into 2 registers - packssdw xmm2, xmm3 - - pmulhuw xmm0, xmm5 - pmulhuw xmm2, xmm5 - - packuswb xmm0, xmm2 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 - jge s4 - - jmp l4b - - // 4 pixel loop - l4: - // top left - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - - // - top right - psubd xmm0, [eax + edx * 4] - psubd xmm1, [eax + edx * 4 + 16] - psubd xmm2, [eax + edx * 4 + 32] - psubd xmm3, [eax + edx * 4 + 48] - lea eax, [eax + 64] - - // - bottom left - psubd xmm0, [esi] - psubd xmm1, [esi + 16] - psubd xmm2, [esi + 32] - psubd xmm3, [esi + 48] - - // + bottom right - paddd xmm0, [esi + edx * 4] - paddd xmm1, [esi + edx * 4 + 16] - paddd xmm2, [esi + edx * 4 + 32] - paddd xmm3, [esi + edx * 4 + 48] - lea esi, [esi + 64] - - cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area - cvtdq2ps xmm1, xmm1 - mulps xmm0, xmm4 - mulps xmm1, xmm4 - cvtdq2ps xmm2, xmm2 - cvtdq2ps xmm3, xmm3 - mulps xmm2, xmm4 - mulps xmm3, xmm4 - cvtps2dq xmm0, xmm0 - cvtps2dq xmm1, xmm1 - cvtps2dq xmm2, xmm2 - cvtps2dq xmm3, xmm3 - packssdw xmm0, xmm1 - packssdw xmm2, xmm3 - packuswb xmm0, xmm2 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - movdqu xmm0, [eax] - psubd xmm0, [eax + edx * 4] - lea eax, [eax + 16] - psubd xmm0, [esi] - paddd 
xmm0, [esi + edx * 4] - lea esi, [esi + 16] - cvtdq2ps xmm0, xmm0 - mulps xmm0, xmm4 - cvtps2dq xmm0, xmm0 - packssdw xmm0, xmm0 - packuswb xmm0, xmm0 - movd dword ptr [edi], xmm0 - lea edi, [edi + 4] - sub ecx, 1 - jge l1 - l1b: - } -} -#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 - -#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 -// Creates a table of cumulative sums where each value is a sum of all values -// above and to the left of the value. -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - __asm { - mov eax, row - mov edx, cumsum - mov esi, previous_cumsum - mov ecx, width - pxor xmm0, xmm0 - pxor xmm1, xmm1 - - sub ecx, 4 - jl l4b - test edx, 15 - jne l4b - - // 4 pixel loop - l4: - movdqu xmm2, [eax] // 4 argb pixels 16 bytes. - lea eax, [eax + 16] - movdqa xmm4, xmm2 - - punpcklbw xmm2, xmm1 - movdqa xmm3, xmm2 - punpcklwd xmm2, xmm1 - punpckhwd xmm3, xmm1 - - punpckhbw xmm4, xmm1 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm1 - punpckhwd xmm5, xmm1 - - paddd xmm0, xmm2 - movdqu xmm2, [esi] // previous row above. - paddd xmm2, xmm0 - - paddd xmm0, xmm3 - movdqu xmm3, [esi + 16] - paddd xmm3, xmm0 - - paddd xmm0, xmm4 - movdqu xmm4, [esi + 32] - paddd xmm4, xmm0 - - paddd xmm0, xmm5 - movdqu xmm5, [esi + 48] - lea esi, [esi + 64] - paddd xmm5, xmm0 - - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - movdqu [edx + 32], xmm4 - movdqu [edx + 48], xmm5 - - lea edx, [edx + 64] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - movd xmm2, dword ptr [eax] // 1 argb pixel - lea eax, [eax + 4] - punpcklbw xmm2, xmm1 - punpcklwd xmm2, xmm1 - paddd xmm0, xmm2 - movdqu xmm2, [esi] - lea esi, [esi + 16] - paddd xmm2, xmm0 - movdqu [edx], xmm2 - lea edx, [edx + 16] - sub ecx, 1 - jge l1 - - l1b: - } -} -#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 - -#ifdef HAS_ARGBAFFINEROW_SSE2 -// Copy ARGB pixels from source image with slope to a row of destination. 
-__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 12] // src_argb - mov esi, [esp + 16] // stride - mov edx, [esp + 20] // dst_argb - mov ecx, [esp + 24] // pointer to uv_dudv - movq xmm2, qword ptr [ecx] // uv - movq xmm7, qword ptr [ecx + 8] // dudv - mov ecx, [esp + 28] // width - shl esi, 16 // 4, stride - add esi, 4 - movd xmm5, esi - sub ecx, 4 - jl l4b - - // setup for 4 pixel loop - pshufd xmm7, xmm7, 0x44 // dup dudv - pshufd xmm5, xmm5, 0 // dup 4, stride - movdqa xmm0, xmm2 // x0, y0, x1, y1 - addps xmm0, xmm7 - movlhps xmm2, xmm0 - movdqa xmm4, xmm7 - addps xmm4, xmm4 // dudv *= 2 - movdqa xmm3, xmm2 // x2, y2, x3, y3 - addps xmm3, xmm4 - addps xmm4, xmm4 // dudv *= 4 - - // 4 pixel loop - l4: - cvttps2dq xmm0, xmm2 // x, y float to int first 2 - cvttps2dq xmm1, xmm3 // x, y float to int next 2 - packssdw xmm0, xmm1 // x, y as 8 shorts - pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
- movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd edi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd xmm1, [eax + esi] // read pixel 0 - movd xmm6, [eax + edi] // read pixel 1 - punpckldq xmm1, xmm6 // combine pixel 0 and 1 - addps xmm2, xmm4 // x, y += dx, dy first 2 - movq qword ptr [edx], xmm1 - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd edi, xmm0 - movd xmm6, [eax + esi] // read pixel 2 - movd xmm0, [eax + edi] // read pixel 3 - punpckldq xmm6, xmm0 // combine pixel 2 and 3 - addps xmm3, xmm4 // x, y += dx, dy next 2 - movq qword ptr 8[edx], xmm6 - lea edx, [edx + 16] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - cvttps2dq xmm0, xmm2 // x, y float to int - packssdw xmm0, xmm0 // x, y as shorts - pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride - addps xmm2, xmm7 // x, y += dx, dy - movd esi, xmm0 - movd xmm0, [eax + esi] // copy a pixel - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge l1 - l1b: - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBAFFINEROW_SSE2 - -#ifdef HAS_INTERPOLATEROW_AVX2 -// Bilinear filter 32x2 -> 32x1 -__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 256. Blend 100 / 0. - sub edi, esi - cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. - - vmovd xmm0, eax // high fraction 0..255 - neg eax - add eax, 256 - vmovd xmm5, eax // low fraction 256..1 - vpunpcklbw xmm5, xmm5, xmm0 - vpunpcklwd xmm5, xmm5, xmm5 - vbroadcastss ymm5, xmm5 - - mov eax, 0x80808080 // 128b for bias and rounding. 
- vmovd xmm4, eax - vbroadcastss ymm4, xmm4 - - xloop: - vmovdqu ymm0, [esi] - vmovdqu ymm2, [esi + edx] - vpunpckhbw ymm1, ymm0, ymm2 // mutates - vpunpcklbw ymm0, ymm0, ymm2 - vpsubb ymm1, ymm1, ymm4 // bias to signed image - vpsubb ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm5, ymm1 - vpmaddubsw ymm0, ymm5, ymm0 - vpaddw ymm1, ymm1, ymm4 // unbias and round - vpaddw ymm0, ymm0, ymm4 - vpsrlw ymm1, ymm1, 8 - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop - jmp xloop99 - - // Blend 50 / 50. - xloop50: - vmovdqu ymm0, [esi] - vpavgb ymm0, ymm0, [esi + edx] - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop50 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - xloop100: - rep movsb - - xloop99: - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_INTERPOLATEROW_AVX2 - -// Bilinear filter 16x2 -> 16x1 -// TODO(fbarchard): Consider allowing 256 using memcpy. -__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 /256. Blend 100 / 0. - cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. - - movd xmm0, eax // high fraction 0..255 - neg eax - add eax, 256 - movd xmm5, eax // low fraction 255..1 - punpcklbw xmm5, xmm0 - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - mov eax, 0x80808080 // 128 for biasing image to signed. 
- movd xmm4, eax - pshufd xmm4, xmm4, 0x00 - - xloop: - movdqu xmm0, [esi] - movdqu xmm2, [esi + edx] - movdqu xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 - psubb xmm1, xmm4 - movdqa xmm2, xmm5 - movdqa xmm3, xmm5 - pmaddubsw xmm2, xmm0 - pmaddubsw xmm3, xmm1 - paddw xmm2, xmm4 - paddw xmm3, xmm4 - psrlw xmm2, 8 - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqu [esi + edi], xmm2 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop - jmp xloop99 - - // Blend 50 / 50. - xloop50: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop50 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - xloop100: - movdqu xmm0, [esi] - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop100 - - xloop99: - pop edi - pop esi - ret - } -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - movdqu xmm5, [ecx] - mov ecx, [esp + 16] // width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pshufb xmm0, xmm5 - pshufb xmm1, xmm5 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg wloop - ret - } -} - -#ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 
- mov ecx, [esp + 16] // width - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpshufb ymm0, ymm0, ymm5 - vpshufb ymm1, ymm1, ymm5 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg wloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBSHUFFLEROW_AVX2 - -// YUY2 - Macro-pixel = 2 image pixels -// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... - -// UYVY - Macro-pixel = 2 image pixels -// U0Y0V0Y1 - -__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width - sub edx, esi - - convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V - lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 // YUYV - punpckhbw xmm1, xmm2 - movdqu [edi], xmm0 - movdqu [edi + 16], xmm1 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width - sub edx, esi - - convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V - lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y - movdqa xmm1, xmm2 - lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY - punpckhbw xmm2, xmm0 - movdqu [edi], xmm1 - movdqu [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - 
ret - } -} - -#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* src_argb */ - mov edx, [esp + 4 + 8] /* dst_argb */ - mov esi, [esp + 4 + 12] /* poly */ - mov ecx, [esp + 4 + 16] /* width */ - pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - - // 2 pixel loop. - convertloop: - // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel - // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel - movq xmm0, qword ptr [eax] // BGRABGRA - lea eax, [eax + 8] - punpcklbw xmm0, xmm3 - movdqa xmm4, xmm0 - punpcklwd xmm0, xmm3 // pixel 0 - punpckhwd xmm4, xmm3 // pixel 1 - cvtdq2ps xmm0, xmm0 // 4 floats - cvtdq2ps xmm4, xmm4 - movdqa xmm1, xmm0 // X - movdqa xmm5, xmm4 - mulps xmm0, [esi + 16] // C1 * X - mulps xmm4, [esi + 16] - addps xmm0, [esi] // result = C0 + C1 * X - addps xmm4, [esi] - movdqa xmm2, xmm1 - movdqa xmm6, xmm5 - mulps xmm2, xmm1 // X * X - mulps xmm6, xmm5 - mulps xmm1, xmm2 // X * X * X - mulps xmm5, xmm6 - mulps xmm2, [esi + 32] // C2 * X * X - mulps xmm6, [esi + 32] - mulps xmm1, [esi + 48] // C3 * X * X * X - mulps xmm5, [esi + 48] - addps xmm0, xmm2 // result += C2 * X * X - addps xmm4, xmm6 - addps xmm0, xmm1 // result += C3 * X * X * X - addps xmm4, xmm5 - cvttps2dq xmm0, xmm0 - cvttps2dq xmm4, xmm4 - packuswb xmm0, xmm4 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 2 - jg convertloop - pop esi - ret - } -} -#endif // HAS_ARGBPOLYNOMIALROW_SSE2 - -#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* poly */ - vbroadcastf128 ymm4, [ecx] // C0 - vbroadcastf128 ymm5, [ecx + 16] // C1 - vbroadcastf128 ymm6, [ecx + 32] // C2 - vbroadcastf128 
ymm7, [ecx + 48] // C3 - mov ecx, [esp + 16] /* width */ - - // 2 pixel loop. - convertloop: - vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels - lea eax, [eax + 8] - vcvtdq2ps ymm0, ymm0 // X 8 floats - vmulps ymm2, ymm0, ymm0 // X * X - vmulps ymm3, ymm0, ymm7 // C3 * X - vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X - vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X - vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X - vcvttps2dq ymm0, ymm0 - vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 - vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 - vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 - vmovq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 2 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBPOLYNOMIALROW_AVX2 - -#ifdef HAS_HALFFLOATROW_SSE2 -static float kExpBias = 1.9259299444e-34f; -__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - movd xmm4, dword ptr [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - mulss xmm4, kExpBias - pshufd xmm4, xmm4, 0 - pxor xmm5, xmm5 - sub edx, eax - - // 8 pixel loop. 
- convertloop: - movdqu xmm2, xmmword ptr [eax] // 8 shorts - add eax, 16 - movdqa xmm3, xmm2 - punpcklwd xmm2, xmm5 - cvtdq2ps xmm2, xmm2 // convert 8 ints to floats - punpckhwd xmm3, xmm5 - cvtdq2ps xmm3, xmm3 - mulps xmm2, xmm4 - mulps xmm3, xmm4 - psrld xmm2, 13 - psrld xmm3, 13 - packssdw xmm2, xmm3 - movdqu [eax + edx - 16], xmm2 - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_HALFFLOATROW_SSE2 - -#ifdef HAS_HALFFLOATROW_AVX2 -__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - movd xmm4, dword ptr [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - - vmulss xmm4, xmm4, kExpBias - vbroadcastss ymm4, xmm4 - vpxor ymm5, ymm5, ymm5 - sub edx, eax - - // 16 pixel loop. - convertloop: - vmovdqu ymm2, [eax] // 16 shorts - add eax, 32 - vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints - vpunpcklwd ymm2, ymm2, ymm5 - vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats - vcvtdq2ps ymm2, ymm2 - vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. - vmulps ymm2, ymm2, ymm4 - vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate - vpsrld ymm2, ymm2, 13 - vpackssdw ymm2, ymm2, ymm3 - vmovdqu [eax + edx - 32], ymm2 - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_HALFFLOATROW_AVX2 - -#ifdef HAS_HALFFLOATROW_F16C -__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - vbroadcastss ymm4, [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - sub edx, eax - - // 16 pixel loop. 
- convertloop: - vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints - vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts - add eax, 32 - vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats - vcvtdq2ps ymm3, ymm3 - vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 - vmulps ymm3, ymm3, ymm4 - vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate - vcvtps2ph xmm3, ymm3, 3 - vmovdqu [eax + edx + 32], xmm2 - vmovdqu [eax + edx + 32 + 16], xmm3 - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_HALFFLOATROW_F16C - -#ifdef HAS_ARGBCOLORTABLEROW_X86 -// Tranform ARGB pixels with color table. -__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ - - // 1 pixel loop. - convertloop: - movzx edx, byte ptr [eax] - lea eax, [eax + 4] - movzx edx, byte ptr [esi + edx * 4] - mov byte ptr [eax - 4], dl - movzx edx, byte ptr [eax - 4 + 1] - movzx edx, byte ptr [esi + edx * 4 + 1] - mov byte ptr [eax - 4 + 1], dl - movzx edx, byte ptr [eax - 4 + 2] - movzx edx, byte ptr [esi + edx * 4 + 2] - mov byte ptr [eax - 4 + 2], dl - movzx edx, byte ptr [eax - 4 + 3] - movzx edx, byte ptr [esi + edx * 4 + 3] - mov byte ptr [eax - 4 + 3], dl - dec ecx - jg convertloop - pop esi - ret - } -} -#endif // HAS_ARGBCOLORTABLEROW_X86 - -#ifdef HAS_RGBCOLORTABLEROW_X86 -// Tranform RGB pixels with color table. -__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ - - // 1 pixel loop. 
- convertloop: - movzx edx, byte ptr [eax] - lea eax, [eax + 4] - movzx edx, byte ptr [esi + edx * 4] - mov byte ptr [eax - 4], dl - movzx edx, byte ptr [eax - 4 + 1] - movzx edx, byte ptr [esi + edx * 4 + 1] - mov byte ptr [eax - 4 + 1], dl - movzx edx, byte ptr [eax - 4 + 2] - movzx edx, byte ptr [esi + edx * 4 + 2] - mov byte ptr [eax - 4 + 2], dl - dec ecx - jg convertloop - - pop esi - ret - } -} -#endif // HAS_RGBCOLORTABLEROW_X86 - -#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 -// Tranform RGB pixels with luma table. -__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ - mov ecx, [esp + 8 + 12] /* width */ - movd xmm2, dword ptr [esp + 8 + 16] // luma table - movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff - pshufd xmm2, xmm2, 0 - pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 - psllw xmm4, 8 - pxor xmm5, xmm5 - - // 4 pixel loop. - convertloop: - movdqu xmm0, xmmword ptr [eax] // generate luma ptr - pmaddubsw xmm0, xmm3 - phaddw xmm0, xmm0 - pand xmm0, xmm4 // mask out low bits - punpcklwd xmm0, xmm5 - paddd xmm0, xmm2 // add table base - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi], dl - movzx edx, byte ptr [eax + 1] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 1], dl - movzx edx, byte ptr [eax + 2] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 2], dl - movzx edx, byte ptr [eax + 3] // copy alpha. 
- mov byte ptr [edi + 3], dl - - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax + 4] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 4], dl - movzx edx, byte ptr [eax + 5] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 5], dl - movzx edx, byte ptr [eax + 6] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 6], dl - movzx edx, byte ptr [eax + 7] // copy alpha. - mov byte ptr [edi + 7], dl - - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax + 8] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 8], dl - movzx edx, byte ptr [eax + 9] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 9], dl - movzx edx, byte ptr [eax + 10] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 10], dl - movzx edx, byte ptr [eax + 11] // copy alpha. - mov byte ptr [edi + 11], dl - - movd esi, xmm0 - - movzx edx, byte ptr [eax + 12] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 12], dl - movzx edx, byte ptr [eax + 13] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 13], dl - movzx edx, byte ptr [eax + 14] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 14], dl - movzx edx, byte ptr [eax + 15] // copy alpha. 
- mov byte ptr [edi + 15], dl - - lea eax, [eax + 16] - lea edi, [edi + 16] - sub ecx, 4 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 - -#endif // defined(_M_X64) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale.cc index 591a6a938..80b030dc2 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/scale.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/scale.cc @@ -1118,6 +1118,11 @@ void ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1313,6 +1318,11 @@ void ScalePlaneBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_C; diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_argb.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_argb.cc index 8d2509474..ddd8d29ed 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/scale_argb.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/scale_argb.cc @@ -348,6 +348,11 @@ static void ScaleARGBBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; @@ -467,6 +472,11 @@ static void ScaleARGBBilinearUp(int src_width, InterpolateRow = InterpolateRow_LSX; } } +#endif +#if 
defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } #endif if (src_width >= 32768) { ScaleARGBFilterCols = @@ -724,6 +734,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_common.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_common.cc index 5e603fd40..774559032 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/scale_common.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/scale_common.cc @@ -1678,6 +1678,12 @@ void ScalePlaneVertical(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + for (j = 0; j < dst_height; ++j) { int yi; int yf; diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_mmi.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_mmi.cc deleted file mode 100644 index 1226ef3ea..000000000 --- a/libfenrir/src/main/jni/animation/libyuv/source/scale_mmi.cc +++ /dev/null @@ -1,1168 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyARGB -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -// CPU agnostic row functions -void ScaleRowDown2_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlh %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest0, dest1; - - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "and %[dest0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "and %[dest1], %[src1], %[mask] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - - "psrlh %[src0], 
%[src0], %[shift] \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - "packushb %[dest1], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - - uint64_t s0, s1, t0, t1; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift0 = 0x2ULL; - const uint64_t shift1 = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlh %[s1], %[s0], %[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest0], %[s0], %[s1] \n\t" - "paddh %[dest0], %[dest0], %[t0] \n\t" - "paddh %[dest0], %[dest0], %[t1] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift0] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlh %[s1], %[s0], %[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest1], %[s0], %[s1] \n\t" - "paddh %[dest1], %[dest1], %[t0] \n\t" - "paddh %[dest1], %[dest1], %[t1] \n\t" - 
"paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift0] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x08(%[src_ptr]) \n\t" - "punpcklwd %[dest_lo], %[src0], %[src1] \n\t" - "lwc1 %[src0], 
0x04(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "punpcklwd %[dest_hi], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* s = src_argb; - const uint8_t* t = src_argb + src_stride; - - uint64_t s0, s_hi, s_lo; - uint64_t t0, t_hi, t_lo; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shfit = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t" - - "paddh %[dest_lo], %[dest_lo], %[ph] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_hi], %[dest_hi], %[t_lo] 
\n\t" - "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t" - - "paddh %[dest_hi], %[dest_hi], %[ph] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), - [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit) - : "memory"); -} - -void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x10ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - - "packsswh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - 
"gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpcklhw %[dest_lo], %[src0], %[src1] \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - - "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t" - "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t" - - "pavgh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - - uint64_t s0, s1, s_hi, s_lo; - uint64_t t0, t1, t_hi, t_lo; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0000000200000002ULL; - const uint64_t mask = 0x0000ffff0000ffffULL; - const uint64_t shift0 = 0x10ULL; - const uint64_t shift1 = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest0], %[s0], %[s1] \n\t" - "paddw %[dest0], %[dest0], %[t0] \n\t" - "paddw %[dest0], %[dest0], %[t1] \n\t" - "paddw %[dest0], %[dest0], %[ph] \n\t" - "psrlw %[dest0], %[dest0], %[shift1] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - 
"gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest1], %[s0], %[s1] \n\t" - "paddw %[dest1], %[dest1], %[t0] \n\t" - "paddw %[dest1], %[dest1], %[t1] \n\t" - "paddw %[dest1], %[dest1], %[ph] \n\t" - "psrlw %[dest1], %[dest1], %[shift1] \n\t" - - "packsswh %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi), - [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t shift = 0x10ULL; - const uint64_t mask = 0x000000ff000000ffULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_lo], %[src0], %[src1] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - 
"psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_hi], %[src0], %[src1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift), [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhhw %[dest_lo], %[src0], %[src1] \n\t" - "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [mask] "f"(mask) - : "memory"); -} 
- -#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \ - "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \ - "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \ - "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \ - "paddh " #reg ", " #reg ", %[ph] \n\t" \ - "psrlh " #reg ", " #reg ", %[shift] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* LibYUVScaleTest.ScaleDownBy4_Box */ -void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* src0_ptr = src_ptr; - const uint8_t* src1_ptr = src_ptr + src_stride; - const uint8_t* src2_ptr = src_ptr + src_stride * 2; - const uint8_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x0001000100010001ULL; - const uint64_t ph = 0x0008000800080008ULL; - const uint64_t shift = 0x4ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest3]) - - "packsswh %[dest_lo], %[dest0], 
%[dest1] \n\t" - "packsswh %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \ - "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \ - "paddw %[dest], %[dest_hi], %[dest] \n\t" \ - "paddw %[dest], %[dest], %[ph] \n\t" \ - "psraw %[dest], %[dest], %[shift] \n\t" \ - "and " #reg ", %[dest], %[mask1] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* 
LibYUVScaleTest.ScaleDownBy4_Box_16 */ -void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src0_ptr = src_ptr; - const uint16_t* src1_ptr = src_ptr + src_stride; - const uint16_t* src2_ptr = src_ptr + src_stride * 2; - const uint16_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x00000000ffffffffULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 0x04ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3]) - "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t" - "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. 
-void ScaleColsUp2_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - - "punpcklhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - uint64_t src, src_hi, src_lo, dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddush %[dest0], %[dest0], %[src_lo] \n\t" - 
"gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddush %[dest1], %[dest1], %[src_hi] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleAddRow_16_MMI(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width) { - uint64_t src, src_hi, src_lo, dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklhw %[src_lo], %[src], %[mask] \n\t" - "punpckhhw %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddw %[dest0], %[dest0], %[src_lo] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - - "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddw %[dest1], %[dest1], %[src_hi] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - 
int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "lwc1 %[src1], 0x00(%[src_ptr]) \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), - [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* src0_ptr = src_argb; - const uint8_t* src1_ptr = src_argb + src_stride; - - uint64_t src0, src1, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shift = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift] \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh 
%[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask), - [ph] "f"(ph) - : "memory"); -} - -// Scales a single row of pixels using point sampling. 
-void ScaleARGBCols_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - const uint32_t* src_tmp; - - uint64_t dest, offset; - - const uint64_t shift0 = 16; - const uint64_t shift1 = 2; - - __asm__ volatile( - "1: \n\t" - "srav %[offset], %[x], %[shift0] \n\t" - "sllv %[offset], %[offset], %[shift1] \n\t" - "dadd %[src_tmp], %[src_ptr], %[offset] \n\t" - "lwc1 %[dest], 0x00(%[src_tmp]) \n\t" - "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[x], %[x], %[dx] \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - uint64_t src, dest0, dest1; - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklwd %[dest0], %[src], %[src] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "punpckhwd %[dest1], %[src], %[src] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -// Divide num by div and return as 16.16 fixed point result. 
-/* LibYUVBaseTest.TestFixedDiv */ -int FixedDiv_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [shift] "r"(shift)); - - return quotient; -} - -// Divide num by div and return as 16.16 fixed point result. -/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */ -int FixedDiv1_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - const int val1 = 1; - const int64_t val11 = 0x00010001ULL; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "dsub %[num], %[num], %[val11] \n\t" - "dsub %[div], %[div], %[val1] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11), - [shift] "r"(shift)); - - return quotient; -} - -// Read 8x2 upsample with filtering and write 16x1. -// actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src2_ptr = src_ptr + src_stride; - - uint64_t src0, src1; - uint64_t dest, dest04, dest15, dest26, dest37; - uint64_t tmp0, tmp1, tmp2, tmp3; - - const uint64_t mask0 = 0x0003000900030009ULL; - const uint64_t mask1 = 0x0001000300010003ULL; - const uint64_t mask2 = 0x0009000300090003ULL; - const uint64_t mask3 = 0x0003000100030001ULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 4; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t" - "pmaddhw %[dest04], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest04], %[dest04], %[dest] \n\t" - "paddw %[dest04], %[dest04], %[ph] \n\t" - "psrlw %[dest04], %[dest04], %[shift] \n\t" - - "pmaddhw %[dest15], 
%[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest15], %[dest15], %[dest] \n\t" - "paddw %[dest15], %[dest15], %[ph] \n\t" - "psrlw %[dest15], %[dest15], %[shift] \n\t" - - "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t" - "pmaddhw %[dest26], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest26], %[dest26], %[dest] \n\t" - "paddw %[dest26], %[dest26], %[ph] \n\t" - "psrlw %[dest26], %[dest26], %[shift] \n\t" - - "pmaddhw %[dest37], %[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest37], %[dest37], %[dest] \n\t" - "paddw %[dest37], %[dest37], %[ph] \n\t" - "psrlw %[dest37], %[dest37], %[shift] \n\t" - - /* tmp0 = ( 00 04 02 06 ) */ - "packsswh %[tmp0], %[dest04], %[dest26] \n\t" - /* tmp1 = ( 01 05 03 07 ) */ - "packsswh %[tmp1], %[dest15], %[dest37] \n\t" - - /* tmp2 = ( 00 01 04 05 )*/ - "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t" - /* tmp3 = ( 02 03 06 07 )*/ - "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t" - - /* ( 00 01 02 03 ) */ - "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - /* ( 04 05 06 07 ) */ - "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04), - [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [dest] "=&f"(dest) - : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst), - [width] 
"r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph) - : "memory"); -} - -void ScaleRowDown34_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - uint64_t src[2]; - uint64_t tmp[2]; - __asm__ volatile ( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "and %[tmp1], %[src0], %[mask1] \n\t" - "psrlw %[tmp0], %[src0], %[rmov] \n\t" - "psllw %[tmp0], %[tmp0], %[lmov1] \n\t" - "or %[src0], %[tmp0], %[tmp1] \n\t" - "punpckhwd %[tmp0], %[src0], %[src0] \n\t" - "psllw %[tmp1], %[tmp0], %[rmov] \n\t" - "or %[src0], %[src0], %[tmp1] \n\t" - "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t" - "pextrh %[tmp0], %[tmp0], %[zero] \n\t" - "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t" - "pextrh %[tmp0], %[src1], %[zero] \n\t" - "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t" - - "punpckhwd %[tmp0], %[src1], %[src1] \n\t" - "pextrh %[tmp1], %[tmp0], %[zero] \n\t" - "psrlw %[src1], %[src1], %[rmov] \n\t" - "psllw %[tmp1], %[tmp1], %[rmov8] \n\t" - "or %[src1], %[src1], %[tmp1] \n\t" - "and %[tmp0], %[tmp0], %[mask2] \n\t" - "or %[src1], %[src1], %[tmp0] \n\t" - - "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "bnez %[width], 1b \n\t" - - : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]), - [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1]) - : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst), - [lmov]"f"(0xc), [rmov]"f"(0x18), - [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8), - [zero]"f"(0x0), [mask2]"f"(0xff000000), - [width]"r"(dst_width), [lmov1]"f"(0x10) - 
: "memory" - ); -} -// clang-format on - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_uv.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_uv.cc index 65f986e93..1556071d0 100644 --- a/libfenrir/src/main/jni/animation/libyuv/source/scale_uv.cc +++ b/libfenrir/src/main/jni/animation/libyuv/source/scale_uv.cc @@ -397,6 +397,11 @@ static void ScaleUVBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEUVFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; @@ -510,6 +515,11 @@ static void ScaleUVBilinearUp(int src_width, InterpolateRow = InterpolateRow_LSX; } } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } #endif if (src_width >= 32768) { ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C; diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_win.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_win.cc deleted file mode 100644 index ea1f95c6c..000000000 --- a/libfenrir/src/main/jni/animation/libyuv/source/scale_win.cc +++ /dev/null @@ -1,1392 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) - -// Offsets for source bytes 0 to 9 -static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 0 to 10 -static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, - 8, 9, 9, 10, 10, 11, 12, 13}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
-static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, - 10, 11, 12, 13, 13, 14, 14, 15}; - -// Coefficients for source bytes 0 to 10 -static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; - -// Coefficients for source bytes 10 to 21 -static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; - -// Coefficients for source bytes 21 to 31 -static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; - -// Coefficients for source bytes 21 to 31 -static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; - -static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, - 6, 8, 11, 14, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 0,1,2 -static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 3,4,5 -static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, - 6, 7, 12, 13, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x3 and 2x3 -static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, - 65536 / 9, 65536 / 6, 0, 0}; - -// Arrange first value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, - 11, 128, 14, 128, 128, 128, 128, 128}; - -// Arrange second value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, - 12, 128, 15, 128, 128, 128, 128, 128}; - -// Arrange third value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, - 13, 128, 128, 128, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x2 and 2x2 -static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, - 65536 / 3, 65536 / 2, 0, 0}; - -// Reads 32 pixels, throws half away and writes 16 pixels. 
-__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} - -// Blends 32x1 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. 
-__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add - paddw xmm1, xmm3 - psrlw xmm0, 1 - psrlw xmm1, 1 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - pop esi - ret - } -} - -#ifdef HAS_SCALEROWDOWN2_AVX2 -// Reads 64 pixels, throws half away and writes 32 pixels. -__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // isolate odd pixels. - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - vzeroupper - ret - } -} - -// Blends 64x1 rectangle to 32x1. 
-__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b - vpsrlw ymm4, ymm4, 15 - vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 - vpavgw ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - vzeroupper - ret - } -} - -// For rounding, average = (sum + 2) / 4 -// becomes average((sum >> 1), 0) -// Blends 64x2 rectangle to 32x1. -__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b - vpsrlw ymm4, ymm4, 15 - vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + esi] - vmovdqu ymm3, [eax + esi + 32] - lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add - vpaddw ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 - vpsrlw ymm1, ymm1, 1 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 - vpavgw ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - 
jg wloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_SCALEROWDOWN2_AVX2 - -// Point samples 32 pixels to 8 pixels. -__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 - psrld xmm5, 24 - pslld xmm5, 16 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm0, 8 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - ret - } -} - -// Blends 32x4 rectangle to 8x1. -__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - movdqa xmm5, xmm4 - packuswb xmm4, xmm4 - psllw xmm5, 3 // constant 0x0008 - - wloop: - movdqu xmm0, [eax] // average rows - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add rows 0, 1 - paddw xmm1, xmm3 - movdqu xmm2, [eax + esi * 2] - movdqu xmm3, [eax + esi * 2 + 16] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 2 - paddw xmm1, xmm3 - movdqu xmm2, [eax + edi] - movdqu xmm3, [eax + edi + 16] - lea eax, [eax + 32] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 3 - paddw xmm1, xmm3 - phaddw xmm0, xmm1 - paddw xmm0, xmm5 // + 8 for round - psrlw xmm0, 4 // /16 
for average of 4 * 4 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_SCALEROWDOWN4_AVX2 -// Point samples 64 pixels to 16 pixels. -__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 - vpsrld ymm5, ymm5, 24 - vpslld ymm5, ymm5, 16 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - vzeroupper - ret - } -} - -// Blends 64x4 rectangle to 16x1. 
-__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 - vpsrlw ymm4, ymm4, 15 - vpsllw ymm5, ymm4, 3 // constant 0x0008 - vpackuswb ymm4, ymm4, ymm4 - - wloop: - vmovdqu ymm0, [eax] // average rows - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + esi] - vmovdqu ymm3, [eax + esi + 32] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 - vpaddw ymm1, ymm1, ymm3 - vmovdqu ymm2, [eax + esi * 2] - vmovdqu ymm3, [eax + esi * 2 + 32] - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 2 - vpaddw ymm1, ymm1, ymm3 - vmovdqu ymm2, [eax + edi] - vmovdqu ymm3, [eax + edi + 32] - lea eax, [eax + 64] - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 3 - vpaddw ymm1, ymm1, ymm3 - vphaddw ymm0, ymm0, ymm1 // mutates - vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw - vpaddw ymm0, ymm0, ymm5 // + 8 for round - vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_SCALEROWDOWN4_AVX2 - -// Point samples 32 pixels to 24 pixels. -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. 
- -__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm3, xmmword ptr kShuf0 - movdqa xmm4, xmmword ptr kShuf1 - movdqa xmm5, xmmword ptr kShuf2 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm1 - palignr xmm1, xmm0, 8 - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 24x1 -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -// Register usage: -// xmm0 src_row 0 -// xmm1 src_row 1 -// xmm2 shuf 0 -// xmm3 shuf 1 -// xmm4 shuf 2 -// xmm5 madd 0 -// xmm6 madd 1 -// xmm7 kRound34 - -// Note that movdqa+palign may be better than movdqu. 
-__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} - -// Note that movdqa+palign may be better than movdqu. 
-__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx+24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} - -// 3/8 point sampler - -// Scale 32 pixels to 12 -__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm4, xmmword ptr kShuf38a - movdqa xmm5, xmmword ptr kShuf38b - - xloop: - movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 - lea eax, [eax + 32] - pshufb xmm0, xmm4 - pshufb xmm1, xmm5 - paddusb xmm0, xmm1 - - movq qword ptr [edx], 
xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edx + 8], xmm1 - lea edx, [edx + 12] - sub ecx, 12 - jg xloop - - ret - } -} - -// Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAc - movdqa xmm3, xmmword ptr kShufAc3 - movdqa xmm4, xmmword ptr kScaleAc33 - pxor xmm5, xmm5 - - xloop: - movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqu xmm6, [eax + esi] - movhlps xmm1, xmm0 - movhlps xmm7, xmm6 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - movdqu xmm6, [eax + esi * 2] - lea eax, [eax + 16] - movhlps xmm7, xmm6 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - pshufb xmm6, xmm2 - - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - pshufb xmm7, xmm3 - paddusw xmm6, xmm7 - - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 - packuswb xmm6, xmm6 - - movd [edx], xmm6 // write 6 pixels - psrlq xmm6, 16 - movd [edx + 2], xmm6 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} - -// Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAb0 - movdqa xmm3, xmmword 
ptr kShufAb1 - movdqa xmm4, xmmword ptr kShufAb2 - movdqa xmm5, xmmword ptr kScaleAb2 - - xloop: - movdqu xmm0, [eax] // average 2 rows into xmm0 - movdqu xmm1, [eax + esi] - lea eax, [eax + 16] - pavgb xmm0, xmm1 - - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 - pshufb xmm1, xmm2 - movdqa xmm6, xmm0 - pshufb xmm6, xmm3 - paddusw xmm1, xmm6 - pshufb xmm0, xmm4 - paddusw xmm1, xmm0 - - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 - packuswb xmm1, xmm1 - - movd [edx], xmm1 // write 6 pixels - psrlq xmm1, 16 - movd [edx + 2], xmm1 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} - -// Reads 16 bytes and accumulates to 16 shorts at a time. -__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr - mov ecx, [esp + 12] // src_width - pxor xmm5, xmm5 - - // sum rows - xloop: - movdqu xmm3, [eax] // read 16 bytes - lea eax, [eax + 16] - movdqu xmm0, [edx] // read 16 words from destination - movdqu xmm1, [edx + 16] - movdqa xmm2, xmm3 - punpcklbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - paddusw xmm0, xmm2 // sum 16 words - paddusw xmm1, xmm3 - movdqu [edx], xmm0 // write 16 words to destination - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 16 - jg xloop - ret - } -} - -#ifdef HAS_SCALEADDROW_AVX2 -// Reads 32 bytes and accumulates to 32 shorts at a time. 
-__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr - mov ecx, [esp + 12] // src_width - vpxor ymm5, ymm5, ymm5 - - // sum rows - xloop: - vmovdqu ymm3, [eax] // read 32 bytes - lea eax, [eax + 32] - vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck - vpunpcklbw ymm2, ymm3, ymm5 - vpunpckhbw ymm3, ymm3, ymm5 - vpaddusw ymm0, ymm2, [edx] // sum 16 words - vpaddusw ymm1, ymm3, [edx + 32] - vmovdqu [edx], ymm0 // write 32 words to destination - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 32 - jg xloop - - vzeroupper - ret - } -} -#endif // HAS_SCALEADDROW_AVX2 - -// Constant for making pixels signed to avoid pmaddubsw -// saturation. -static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Constant for making pixels unsigned and adding .5 for rounding. -static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, - 0x4040, 0x4040, 0x4040, 0x4040}; - -// Bilinear column filtering. SSSE3 version. -__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - push ebx - push esi - push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width - movd xmm2, [esp + 12 + 16] // x - movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. - movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pcmpeqb xmm7, xmm7 // generate 0x0001 - psrlw xmm7, 15 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. 
preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. - movzx ebx, word ptr [esi + edx] // 2 source x1 pixels - movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 - punpcklwd xmm0, xmm4 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm1, xmm6 // 0..7f and 7f..0 - paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm1, xmm1 // 8 bits, 2 pixels. - movd ebx, xmm1 - mov [edi], bx - lea edi, [edi + 2] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm2, xmm6 // 0..7f and 7f..0 - paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm2, xmm0 // 16 bit - paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm2, xmm2 // 8 bits - movd ebx, xmm2 - mov [edi], bl - - xloop99: - - pop edi - pop esi - pop ebx - ret - } -} - -// Reads 16 pixels, duplicates them and writes 32 pixels. 
-__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width - - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 - punpckhbw xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - ret - } -} - -// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - shufps xmm0, xmm1, 0xdd - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} - -// Blends 8x1 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} - -// Blends 8x2 rectangle to 4x1. 
-__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop esi - ret - } -} - -// Reads 4 pixels at a time. -__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - wloop: - movd xmm0, [eax] - movd xmm1, [eax + ebx] - punpckldq xmm0, xmm1 - movd xmm2, [eax + ebx * 2] - movd xmm3, [eax + edi] - lea eax, [eax + ebx * 4] - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop ebx - ret - } -} - -// Blends four 2x2 to 4x1. 
-__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - wloop: - movq xmm0, qword ptr [eax] // row0 4 pairs - movhps xmm0, qword ptr [eax + ebx] - movq xmm1, qword ptr [eax + ebx * 2] - movhps xmm1, qword ptr [eax + edi] - lea eax, [eax + ebx * 4] - movq xmm2, qword ptr [esi] // row1 4 pairs - movhps xmm2, qword ptr [esi + ebx] - movq xmm3, qword ptr [esi + ebx * 2] - movhps xmm3, qword ptr [esi + edi] - lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop esi - pop ebx - ret - } -} - -// Column scaling unfiltered. SSE2 version. -__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push edi - push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 - paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. 
- - cmp ecx, 0 - jle xloop99 - sub ecx, 4 - jl xloop49 - - // 4 Pixel loop. - xloop4: - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 - - movd xmm1, [esi + eax * 4] // 1 source x2 pixels - movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 // 4 pixels - jge xloop4 - - xloop49: - test ecx, 2 - je xloop29 - - // 2 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 - - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - - xloop29: - test ecx, 1 - je xloop99 - - // 1 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x2 pixels - movd dword ptr [edi], xmm0 - xloop99: - - pop esi - pop edi - ret - } -} - -// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
-// TODO(fbarchard): Port to Neon - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static const uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -static const uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - movdqa xmm4, xmmword ptr kShuffleColARGB - movdqa xmm5, xmmword ptr kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. - movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. 
- movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. - psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. - movd [edi], xmm0 - - xloop99: - - pop edi - pop esi - ret - } -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_argb - mov eax, [esp + 8] // src_argb - mov ecx, [esp + 12] // dst_width - - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpckldq xmm0, xmm0 - punpckhdq xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg wloop - - ret - } -} - -// Divide num by div and return as 16.16 fixed point result. -__declspec(naked) int FixedDiv_X86(int num, int div) { - __asm { - mov eax, [esp + 4] // num - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 - shl eax, 16 - idiv dword ptr [esp + 8] - ret - } -} - -// Divide num by div and return as 16.16 fixed point result. 
-__declspec(naked) int FixedDiv1_X86(int num, int div) { - __asm { - mov eax, [esp + 4] // num - mov ecx, [esp + 8] // denom - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 - shl eax, 16 - sub eax, 0x00010001 - sbb edx, 0 - sub ecx, 1 - idiv ecx - ret - } -} -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/libfenrir/src/main/jni/audio/taglib/mpeg/id3v2/id3v2frame.cpp b/libfenrir/src/main/jni/audio/taglib/mpeg/id3v2/id3v2frame.cpp index 1cf07ebf1..c3a45a2a5 100644 --- a/libfenrir/src/main/jni/audio/taglib/mpeg/id3v2/id3v2frame.cpp +++ b/libfenrir/src/main/jni/audio/taglib/mpeg/id3v2/id3v2frame.cpp @@ -373,6 +373,7 @@ namespace std::make_pair("TSO2", "ALBUMARTISTSORT"), // non-standard, used by iTunes std::make_pair("TSRC", "ISRC"), std::make_pair("TSSE", "ENCODING"), + std::make_pair("TSST", "DISCSUBTITLE"), // URL frames std::make_pair("WCOP", "COPYRIGHTURL"), std::make_pair("WOAF", "FILEWEBPAGE"), diff --git a/libfenrir/src/main/jni/audio/taglib/taglib_config.h b/libfenrir/src/main/jni/audio/taglib/taglib_config.h index fa5c4f34a..98fb804ce 100644 --- a/libfenrir/src/main/jni/audio/taglib/taglib_config.h +++ b/libfenrir/src/main/jni/audio/taglib/taglib_config.h @@ -7,4 +7,6 @@ /* Defined if your compiler supports some safer version of vsprintf */ #define HAVE_VSNPRINTF 1 + +#define HAVE_ZLIB 1 #endif diff --git a/libfenrir/src/main/jni/thorvg/inc/thorvg.h b/libfenrir/src/main/jni/thorvg/inc/thorvg.h index 97990a8e4..a8f1f8296 100644 --- a/libfenrir/src/main/jni/thorvg/inc/thorvg.h +++ b/libfenrir/src/main/jni/thorvg/inc/thorvg.h @@ -161,19 +161,34 @@ enum class FillRule EvenOdd ///< A line from the point to a location outside the shape is drawn and its intersections with the path segments of the shape are counted. If the number of intersections is an odd number, the point is inside the shape. 
}; + /** * @brief Enumeration indicating the method used in the composition of two objects - the target and the source. + * + * In the case of Mask composition, you need to perform bit operations on two options - Mask Alpha and Mask Operation. + * Mask Alpha specifies the origin of the alpha channel, while Mask Operation specifies the masking operation. + * @code paint->composite(tvg::CompositeMethod::AlphaMask + tvg::CompositeMethod::AddMaskOp); @endcode + * + * @note If you don't specify the mask alpha, @c AlphaMask will be used. + * @note If you don't specify the mask method, @c AddMaskOp will be used. + * @warning Composition does not support multiple choices for both Mask Alpha and Mask Operation. + * @see Paint::composite() */ enum class CompositeMethod { - None = 0, ///< No composition is applied. - ClipPath, ///< The intersection of the source and the target is determined and only the resulting pixels from the source are rendered. - AlphaMask, ///< The pixels of the source and the target are alpha blended. As a result, only the part of the source, which alpha intersects with the target is visible. - InvAlphaMask, ///< The pixels of the source and the complement to the target's pixels are alpha blended. As a result, only the part of the source which alpha is not covered by the target is visible. - LumaMask, ///< The source pixels are converted to the grayscale (luma value) and alpha blended with the target. As a result, only the part of the source, which intersects with the target is visible. @since 0.9 - InvLumaMask ///< The source pixels are converted to the grayscale (luma value) and the complement to the target's pixels are alpha blended. As a result, only the part of the source which grayscale is not covered by the target is visible. @BETA_API + None = 0, ///< No composition is applied. + ClipPath, ///< The intersection of the source and the target is determined and only the resulting pixels from the source are rendered. 
+ AlphaMask, ///< Mask Alpha: Use the compositing target's pixels as an alpha value. + InvAlphaMask, ///< Mask Alpha: Use the complement to the compositing target's pixels as an alpha. + LumaMask, ///< Mask Alpha: Use the grayscale (0.2125R + 0.7154G + 0.0721*B) of the compositing target's pixels. @since 0.9 + InvLumaMask, ///< Mask Alpha: Use the grayscale (0.2125R + 0.7154G + 0.0721*B) of the complement to the compositing target's pixels. @BETA_API + AddMask, ///< Mask Operation: Combines the source and target pixels using Mask Alpha. @BETA_API + SubtractMask, ///< Mask Operation: Subtracts the target color from the source color while considering their respective Mask Alpha. @BETA_API + IntersectMask, ///< Mask Operation: Computes the result by taking the minimum value between the Mask Alpha and the target alpha and multiplies it with the source color. @BETA_API + DifferenceMask ///< Mask Operation: Calculates the absolute difference between the source color and the target color multiplied by the complement of the Mask Alpha. @BETA_API }; + /** * @brief Enumeration specifying the engine type used for the graphics backend. For multiple backends bitwise operation is allowed. */ @@ -1352,8 +1367,8 @@ class TVG_API SwCanvas final : public Canvas */ enum Colorspace { - ABGR8888 = 0, ///< The channels are joined in the order: alpha, blue, green, red. Colors are alpha-premultiplied. - ARGB8888, ///< The channels are joined in the order: alpha, red, green, blue. Colors are alpha-premultiplied. + ABGR8888 = 0, ///< The channels are joined in the order: alpha, blue, green, red. Colors are alpha-premultiplied. (a << 24 | b << 16 | g << 8 | r) + ARGB8888, ///< The channels are joined in the order: alpha, red, green, blue. Colors are alpha-premultiplied. (a << 24 | r << 16 | g << 8 | b) ABGR8888S, ///< @BETA_API The channels are joined in the order: alpha, blue, green, red. Colors are un-alpha-premultiplied. 
ARGB8888S, ///< @BETA_API The channels are joined in the order: alpha, red, green, blue. Colors are un-alpha-premultiplied. }; @@ -1636,6 +1651,7 @@ std::unique_ptr cast(Paint* paint) return std::unique_ptr(static_cast(paint)); } + /** * @brief The cast() function is a utility function used to cast a 'Fill' to type 'T'. * @@ -1648,6 +1664,17 @@ std::unique_ptr cast(Fill* fill) } +/** + * @brief The operator() function is the OR function used to combine Mask Alpha & Mask Operation + * + * @BETA_API + */ +constexpr CompositeMethod operator+(CompositeMethod a, CompositeMethod b) +{ + return CompositeMethod(int(a) | int(b)); +} + + /** @}*/ } //namespace diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwCommon.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwCommon.h index 5977e9321..0181c8c05 100644 --- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwCommon.h +++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwCommon.h @@ -248,7 +248,8 @@ struct SwBlender SwAlpha alpha(CompositeMethod method) { - return alphas[(int)(method) - 2]; //0: None, 1: ClipPath + auto idx = (int)(method) - 2; //0: None, 1: ClipPath + return alphas[idx > 3 ? 0 : idx]; //CompositeMethod has only four Matting methods. 
} }; @@ -287,9 +288,14 @@ static inline uint32_t ALPHA_BLEND(uint32_t c, uint32_t a) ((((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff)); } -static inline uint32_t INTERPOLATE(uint32_t a, uint32_t c0, uint32_t c1) +static inline uint32_t INTERPOLATE(uint32_t s, uint32_t d, uint8_t a) { - return (((((((c0 >> 8) & 0xff00ff) - ((c1 >> 8) & 0xff00ff)) * a) + (c1 & 0xff00ff00)) & 0xff00ff00) + ((((((c0 & 0xff00ff) - (c1 & 0xff00ff)) * a) >> 8) + (c1 & 0xff00ff)) & 0xff00ff)); + return (((((((s >> 8) & 0xff00ff) - ((d >> 8) & 0xff00ff)) * a) + (d & 0xff00ff00)) & 0xff00ff00) + ((((((s & 0xff00ff) - (d & 0xff00ff)) * a) >> 8) + (d & 0xff00ff)) & 0xff00ff)); +} + +static inline uint8_t INTERPOLATE8(uint8_t s, uint8_t d, uint8_t a) +{ + return ((s * a + 0xff) >> 8) + ((d * ~a + 0xff) >> 8); } static inline SwCoord HALF_STROKE(float width) @@ -297,6 +303,61 @@ static inline SwCoord HALF_STROKE(float width) return TO_SWCOORD(width * 0.5f); } +static inline uint8_t MULTIPLY(uint8_t c, uint8_t a) +{ + return ((c * a + 0xff) >> 8); +} + +static inline uint8_t ALPHA(uint32_t c) +{ + return (c >> 24); +} + +static inline uint8_t IALPHA(uint32_t c) +{ + return (~c >> 24); +} + + +typedef uint32_t(*SwBlendOp)(uint32_t s, uint32_t d, uint8_t a); //src, dst, alpha + +static inline uint32_t opAlphaBlend(uint32_t s, uint32_t d, uint8_t a) +{ + auto t = ALPHA_BLEND(s, a); + return t + ALPHA_BLEND(d, IALPHA(t)); +} + +static inline uint32_t opBlend(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + return s + ALPHA_BLEND(d, IALPHA(s)); +} + +static inline uint32_t opAddMask(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + return opBlend(s, d, a); +} + +static inline uint32_t opSubMask(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + return ALPHA_BLEND(d, IALPHA(s)); +} + +static inline uint32_t opIntMask(TVG_UNUSED uint32_t s, uint32_t d, uint8_t a) +{ + return ALPHA_BLEND(d, a); +} + +static inline uint32_t opDifMask(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + return 
ALPHA_BLEND(s, IALPHA(d)) + ALPHA_BLEND(d, IALPHA(s)); +} + +static inline uint32_t opInterpolate(uint32_t s, uint32_t d, uint8_t a) +{ + return INTERPOLATE(s, d, a); +} + + int64_t mathMultiply(int64_t a, int64_t b); int64_t mathDivide(int64_t a, int64_t b); int64_t mathMulDiv(int64_t a, int64_t b, int64_t c); @@ -344,8 +405,10 @@ void imageFree(SwImage* image); bool fillGenColorTable(SwFill* fill, const Fill* fdata, const Matrix* transform, SwSurface* surface, uint32_t opacity, bool ctable); void fillReset(SwFill* fill); void fillFree(SwFill* fill); -void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len); -void fillFetchRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len); +void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a); //blending ver. +void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity); //masking ver. +void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a); //blending ver. +void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity); //masking ver. 
SwRleData* rleRender(SwRleData* rle, const SwOutline* outline, const SwBBox& renderRegion, bool antiAlias); SwRleData* rleRender(const SwBBox* bbox); diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwFill.cpp b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwFill.cpp index 694bc3523..1a432ea86 100644 --- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwFill.cpp +++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwFill.cpp @@ -80,7 +80,7 @@ static bool _updateColorTable(SwFill* fill, const Fill* fdata, const SwSurface* auto dist = static_cast(255 * t); auto dist2 = 255 - dist; - auto color = INTERPOLATE(dist2, rgba, rgba2); + auto color = INTERPOLATE(rgba, rgba2, dist2); fill->ctable[i] = ALPHA_BLEND((color | 0xff000000), (color >> 24)); ++i; @@ -233,7 +233,7 @@ static inline uint32_t _pixel(const SwFill* fill, float pos) /* External Class Implementation */ /************************************************************************/ -void fillFetchRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len) +void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity) { auto rx = (x + 0.5f) * fill->radial.a11 + (y + 0.5f) * fill->radial.a12 + fill->radial.shiftX; auto ry = (x + 0.5f) * fill->radial.a21 + (y + 0.5f) * fill->radial.a22 + fill->radial.shiftY; @@ -244,16 +244,124 @@ void fillFetchRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, auto detFirstDerivative = 2.0f * (fill->radial.a11 * rx + fill->radial.a21 * ry) + 0.5f * detSecondDerivative; auto det = rx * rx + ry * ry; - for (uint32_t i = 0 ; i < len ; ++i) { - *dst = _pixel(fill, sqrtf(det)); - ++dst; - det += detFirstDerivative; - detFirstDerivative += detSecondDerivative; + if (opacity == 255) { + for (uint32_t i = 0 ; i < len ; ++i, ++dst, cmp += csize) { + *dst = opAlphaBlend(_pixel(fill, sqrtf(det)), *dst, alpha(cmp)); + det += detFirstDerivative; 
+ detFirstDerivative += detSecondDerivative; + } + } else { + for (uint32_t i = 0 ; i < len ; ++i, ++dst, cmp += csize) { + *dst = opAlphaBlend(_pixel(fill, sqrtf(det)), *dst, MULTIPLY(opacity, alpha(cmp))); + det += detFirstDerivative; + detFirstDerivative += detSecondDerivative; + } + } +} + + +void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a) +{ + auto rx = (x + 0.5f) * fill->radial.a11 + (y + 0.5f) * fill->radial.a12 + fill->radial.shiftX; + auto ry = (x + 0.5f) * fill->radial.a21 + (y + 0.5f) * fill->radial.a22 + fill->radial.shiftY; + + // detSecondDerivative = d(detFirstDerivative)/dx = d( d(det)/dx )/dx + auto detSecondDerivative = fill->radial.detSecDeriv; + // detFirstDerivative = d(det)/dx + auto detFirstDerivative = 2.0f * (fill->radial.a11 * rx + fill->radial.a21 * ry) + 0.5f * detSecondDerivative; + auto det = rx * rx + ry * ry; + + if (op) { + for (uint32_t i = 0 ; i < len ; ++i, ++dst) { + *dst = op(_pixel(fill, sqrtf(det)), *dst, a); + det += detFirstDerivative; + detFirstDerivative += detSecondDerivative; + } + } else { + for (uint32_t i = 0 ; i < len ; ++i, ++dst) { + *dst = _pixel(fill, sqrtf(det)); + det += detFirstDerivative; + detFirstDerivative += detSecondDerivative; + } } } -void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len) +void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity) +{ + //Rotation + float rx = x + 0.5f; + float ry = y + 0.5f; + float t = (fill->linear.dx * rx + fill->linear.dy * ry + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1); + float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1); + + if (opacity == 255) { + if (mathZero(inc)) { + auto color = _fixedPixel(fill, static_cast(t * FIXPT_SIZE)); + for (uint32_t i = 0; i < len; ++i, ++dst, cmp += csize) { + *dst = opAlphaBlend(color, *dst, alpha(cmp)); + } + return; + } + 
+ auto vMax = static_cast(INT32_MAX >> (FIXPT_BITS + 1)); + auto vMin = -vMax; + auto v = t + (inc * len); + + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast(t * FIXPT_SIZE); + auto inc2 = static_cast(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst, cmp += csize) { + *dst = opAlphaBlend(_fixedPixel(fill, t2), *dst, alpha(cmp)); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + *dst = opAlphaBlend(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, alpha(cmp)); + ++dst; + t += inc; + cmp += csize; + } + } + } else { + if (mathZero(inc)) { + auto color = _fixedPixel(fill, static_cast(t * FIXPT_SIZE)); + for (uint32_t i = 0; i < len; ++i, ++dst, cmp += csize) { + *dst = opAlphaBlend(color, *dst, MULTIPLY(alpha(cmp), opacity)); + } + return; + } + + auto vMax = static_cast(INT32_MAX >> (FIXPT_BITS + 1)); + auto vMin = -vMax; + auto v = t + (inc * len); + + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast(t * FIXPT_SIZE); + auto inc2 = static_cast(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst, cmp += csize) { + *dst = opAlphaBlend(_fixedPixel(fill, t2), *dst, MULTIPLY(alpha(cmp), opacity)); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + *dst = opAlphaBlend(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, MULTIPLY(opacity, alpha(cmp))); + ++dst; + t += inc; + cmp += csize; + } + } + } +} + + +void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a) { //Rotation float rx = x + 0.5f; @@ -263,7 +371,13 @@ void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, if (mathZero(inc)) { auto color = _fixedPixel(fill, static_cast(t * FIXPT_SIZE)); - rasterRGBA32(dst, color, 0, len); + if (op) { + for (uint32_t i = 0; i < len; ++i, ++dst) { + *dst = op(color, *dst, a); + } 
+ } else { + rasterRGBA32(dst, color, 0, len); + } return; } @@ -271,22 +385,41 @@ void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, auto vMin = -vMax; auto v = t + (inc * len); - //we can use fixed point math - if (v < vMax && v > vMin) { - auto t2 = static_cast(t * FIXPT_SIZE); - auto inc2 = static_cast(inc * FIXPT_SIZE); - for (uint32_t j = 0; j < len; ++j) { - *dst = _fixedPixel(fill, t2); - ++dst; - t2 += inc2; + if (op) { + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast(t * FIXPT_SIZE); + auto inc2 = static_cast(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst) { + *dst = op(_fixedPixel(fill, t2), *dst, a); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + *dst = op(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, a); + ++dst; + t += inc; + } } - //we have to fallback to float math } else { - uint32_t counter = 0; - while (counter++ < len) { - *dst = _pixel(fill, t / GRADIENT_STOP_SIZE); - ++dst; - t += inc; + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast(t * FIXPT_SIZE); + auto inc2 = static_cast(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst) { + *dst = _fixedPixel(fill, t2); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + *dst = _pixel(fill, t / GRADIENT_STOP_SIZE); + ++dst; + t += inc; + } } } } diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRaster.cpp b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRaster.cpp index ef350101d..a94bd52f5 100644 --- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRaster.cpp +++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRaster.cpp @@ -37,30 +37,53 @@ /************************************************************************/ constexpr auto DOWN_SCALE_TOLERANCE = 0.5f; -template -static inline T _multiply(T c, T a) 
+struct FillLinear { - return ((c * a + 0xff) >> 8); -} + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a) + { + fillLinear(fill, dst, y, x, len, op, a); + } -static inline uint32_t _alpha(uint32_t c) -{ - return (c >> 24); -} + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len) + { + fillLinear(fill, dst, y, x, len, nullptr, 255); + } + + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity) + { + fillLinear(fill, dst, y, x, len, cmp, alpha, csize, opacity); + } +}; -static inline uint32_t _ialpha(uint32_t c) +struct FillRadial { - return (~c >> 24); -} + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a) + { + fillRadial(fill, dst, y, x, len, op, a); + } + + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len) + { + fillRadial(fill, dst, y, x, len, nullptr, 255); + } + + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity) + { + fillRadial(fill, dst, y, x, len, cmp, alpha, csize, opacity); + } +}; + +static bool _rasterDirectRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity = 255); -static inline uint8_t _alpha(uint8_t* a) + +static inline uint8_t ALPHA(uint8_t* a) { return *a; } -static inline uint8_t _ialpha(uint8_t* a) +static inline uint8_t IALPHA(uint8_t* a) { return ~(*a); } @@ -104,12 +127,6 @@ static inline uint32_t _argbJoin(uint8_t r, uint8_t g, uint8_t b, uint8_t a) } -#include "tvgSwRasterTexmap.h" -#include "tvgSwRasterC.h" -#include "tvgSwRasterAvx.h" -#include "tvgSwRasterNeon.h" - - static inline bool _compositing(const SwSurface* surface) { if (!surface->compositor || (int)surface->compositor->method <= 
(int)CompositeMethod::ClipPath) return false; @@ -117,6 +134,81 @@ static inline bool _compositing(const SwSurface* surface) } +static inline bool _matting(const SwSurface* surface) +{ + if ((int)surface->compositor->method < (int)CompositeMethod::AddMask) return true; + else return false; +} + + +static inline bool _masking(const SwSurface* surface) +{ + if ((int)surface->compositor->method >= (int)CompositeMethod::AddMask) return true; + else return false; +} + + +struct AddMaskOp +{ + uint32_t operator()(uint32_t s, uint32_t d, uint8_t a) + { + return s + ALPHA_BLEND(d, a); + } +}; + + +struct SubMaskOp +{ + uint32_t operator()(uint32_t s, uint32_t d, uint8_t a) + { + return ALPHA_BLEND(d, a); + } +}; + + +struct DifMaskOp +{ + uint32_t operator()(uint32_t s, uint32_t d, uint8_t a) + { + return ALPHA_BLEND(s, IALPHA(d)) + ALPHA_BLEND(d, a); + } +}; + + +struct AddMaskAOp +{ + uint32_t operator()(uint32_t s, uint32_t d, uint8_t a) + { + return INTERPOLATE(s, d, a); + } +}; + + +struct SubMaskAOp +{ + uint32_t operator()(uint32_t s, uint32_t d, uint8_t a) + { + return ALPHA_BLEND(d, IALPHA(ALPHA_BLEND(s, a))); + } +}; + + +struct DifMaskAOp +{ + uint32_t operator()(uint32_t s, uint32_t d, uint8_t a) + { + auto t = ALPHA_BLEND(s, a); + return ALPHA_BLEND(t, IALPHA(d)) + ALPHA_BLEND(d, IALPHA(t)); + } +}; + + +#include "tvgSwRasterTexmap.h" +#include "tvgSwRasterC.h" +#include "tvgSwRasterAvx.h" +#include "tvgSwRasterNeon.h" + + static inline uint32_t _halfScale(float scale) { auto halfScale = static_cast(0.5f / scale); @@ -125,7 +217,7 @@ static inline uint32_t _halfScale(float scale) } //Bilinear Interpolation -static uint32_t _interpUpScaler(const uint32_t *img, uint32_t w, uint32_t h, float sx, float sy) +static uint32_t _interpUpScaler(const uint32_t *img, TVG_UNUSED uint32_t stride, uint32_t w, uint32_t h, float sx, float sy, TVG_UNUSED uint32_t n) { auto rx = (uint32_t)(sx); auto ry = (uint32_t)(sy); @@ -142,13 +234,15 @@ static uint32_t _interpUpScaler(const 
uint32_t *img, uint32_t w, uint32_t h, flo auto c3 = img[rx2 + ry2 * w]; auto c4 = img[rx + ry2 * w]; - return INTERPOLATE(dy, INTERPOLATE(dx, c3, c4), INTERPOLATE(dx, c2, c1)); + return INTERPOLATE(INTERPOLATE(c3, c4, dx), INTERPOLATE(c2, c1, dx), dy); } //2n x 2n Mean Kernel -static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t w, uint32_t h, uint32_t rx, uint32_t ry, uint32_t n) +static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t w, uint32_t h, float sx, float sy, uint32_t n) { + uint32_t rx = sx; + uint32_t ry = sy; uint32_t c[4] = {0, 0, 0, 0}; auto n2 = n * n; auto src = img + rx - n + (ry - n) * stride; @@ -174,21 +268,96 @@ static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t void _rasterGrayscale8(uint8_t *dst, uint32_t val, uint32_t offset, int32_t len) { - cRasterPixels(dst, val, offset, len); + cRasterPixels(dst, val, offset, len); } /************************************************************************/ /* Rect */ /************************************************************************/ -static bool _rasterMaskedRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a, SwAlpha alpha) +template +static void _rasterMaskedRectDup(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + auto w = static_cast(region.max.x - region.min.x); + auto h = static_cast(region.max.y - region.min.y); + auto cbuffer = surface->compositor->image.buf32 + (region.min.y * surface->compositor->image.stride + region.min.x); //compositor buffer + auto cstride = surface->compositor->image.stride; + auto color = surface->blender.join(r, g, b, a); + auto ialpha = 255 - a; + + for (uint32_t y = 0; y < h; ++y) { + auto cmp = cbuffer; + for (uint32_t x = 0; x < w; ++x, ++cmp) { + *cmp = maskOp()(color, *cmp, ialpha); + } + cbuffer += cstride; + } +} + + +static void _rasterMaskedRectInt(SwSurface* surface, const SwBBox& region, 
uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + auto w = static_cast(region.max.x - region.min.x); + auto h = static_cast(region.max.y - region.min.y); + auto cstride = surface->compositor->image.stride; + + for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) { + auto cmp = surface->compositor->image.buf32 + (y * cstride + surface->compositor->bbox.min.x); + if (y == region.min.y) { + for (uint32_t y2 = y; y2 < region.max.y; ++y2) { + auto tmp = cmp; + auto x = surface->compositor->bbox.min.x; + while (x < surface->compositor->bbox.max.x) { + if (x == region.min.x) { + for (uint32_t i = 0; i < w; ++i, ++tmp) { + *tmp = ALPHA_BLEND(*tmp, a); + } + x += w; + } else { + *tmp = 0; + ++tmp; + ++x; + } + } + cmp += cstride; + } + y += (h - 1); + } else { + rasterRGBA32(cmp, 0x00000000, 0, w); + cmp += cstride; + } + } +} + + +static bool _rasterMaskedRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + //32bit channels composition + if (surface->channelSize != sizeof(uint32_t)) return false; + + auto method = surface->compositor->method; + + TVGLOG("SW_ENGINE", "Masked(%d) Rect [Region: %lu %lu %lu %lu]", (int)method, region.min.x, region.min.y, region.max.x - region.max.y, region.min.y); + if (method == CompositeMethod::AddMask) _rasterMaskedRectDup(surface, region, r, g, b, a); + else if (method == CompositeMethod::SubtractMask) _rasterMaskedRectDup(surface, region, r, g, b, a); + else if (method == CompositeMethod::DifferenceMask) _rasterMaskedRectDup(surface, region, r, g, b, a); + else if (method == CompositeMethod::IntersectMask) _rasterMaskedRectInt(surface, region, r, g, b, a); + else return false; + + //Masking Composition + return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterMattedRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) { auto w = static_cast(region.max.x - 
region.min.x); auto h = static_cast(region.max.y - region.min.y); auto csize = surface->compositor->image.channelSize; auto cbuffer = surface->compositor->image.buf8 + ((region.min.y * surface->compositor->image.stride + region.min.x) * csize); //compositor buffer + auto alpha = surface->blender.alpha(surface->compositor->method); - TVGLOG("SW_ENGINE", "Masked Rect [Region: %lu %lu %u %u]", region.min.x, region.min.y, w, h); + TVGLOG("SW_ENGINE", "Matted(%d) Rect [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h); //32bits channels if (surface->channelSize == sizeof(uint32_t)) { @@ -196,10 +365,9 @@ static bool _rasterMaskedRect(SwSurface* surface, const SwBBox& region, uint8_t auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; for (uint32_t y = 0; y < h; ++y) { auto dst = &buffer[y * surface->stride]; - auto cmp = &cbuffer[y * surface->stride * csize]; + auto cmp = &cbuffer[y * surface->compositor->image.stride * csize]; for (uint32_t x = 0; x < w; ++x, ++dst, cmp += csize) { - auto tmp = ALPHA_BLEND(color, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + *dst = INTERPOLATE(color, *dst, alpha(cmp)); } } //8bits grayscale @@ -207,10 +375,9 @@ static bool _rasterMaskedRect(SwSurface* surface, const SwBBox& region, uint8_t auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x; for (uint32_t y = 0; y < h; ++y) { auto dst = &buffer[y * surface->stride]; - auto cmp = &cbuffer[y * surface->stride * csize]; + auto cmp = &cbuffer[y * surface->compositor->image.stride * csize]; for (uint32_t x = 0; x < w; ++x, ++dst, cmp += csize) { - auto tmp = _multiply(a, alpha(cmp)); - *dst = tmp + _multiply(*dst, _ialpha(tmp)); + *dst = INTERPOLATE8(a, *dst, alpha(cmp)); } } } @@ -230,22 +397,25 @@ static bool _rasterSolidRect(SwSurface* surface, const SwBBox& region, uint8_t r for (uint32_t y = 0; y < h; ++y) { rasterRGBA32(buffer + y * surface->stride, color, region.min.x, 
w); } + return true; //8bits grayscale - } else if (surface->channelSize == sizeof(uint8_t)) { + } + if (surface->channelSize == sizeof(uint8_t)) { auto buffer = surface->buf8 + (region.min.y * surface->stride); for (uint32_t y = 0; y < h; ++y) { _rasterGrayscale8(buffer + y * surface->stride, 255, region.min.x, w); } + return true; } - return true; + return false; } static bool _rasterRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) { if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - return _rasterMaskedRect(surface, region, r, g, b, a, alpha); + if (_matting(surface)) return _rasterMattedRect(surface, region, r, g, b, a); + else return _rasterMaskedRect(surface, region, r, g, b, a); } else { if (a == 255) { return _rasterSolidRect(surface, region, r, g, b); @@ -267,17 +437,89 @@ static bool _rasterRect(SwSurface* surface, const SwBBox& region, uint8_t r, uin /* Rle */ /************************************************************************/ -static bool _rasterMaskedRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a, SwAlpha alpha) +template +static void _rasterMaskedRleDup(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) { - TVGLOG("SW_ENGINE", "Masked Rle"); + auto span = rle->spans; + auto cbuffer = surface->compositor->image.buf32; + auto cstride = surface->compositor->image.stride; + auto color = surface->blender.join(r, g, b, a); + uint32_t src; + + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto cmp = &cbuffer[span->y * cstride + span->x]; + if (span->coverage == 255) src = color; + else src = ALPHA_BLEND(color, span->coverage); + auto ialpha = IALPHA(src); + for (auto x = 0; x < span->len; ++x, ++cmp) { + *cmp = maskOp()(src, *cmp, ialpha); + } + } +} + +static void _rasterMaskedRleInt(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ auto span = rle->spans; + 
auto cbuffer = surface->compositor->image.buf32; + auto cstride = surface->compositor->image.stride; + auto color = surface->blender.join(r, g, b, a); uint32_t src; + + for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) { + auto cmp = &cbuffer[y * cstride]; + uint32_t x = surface->compositor->bbox.min.x; + while (x < surface->compositor->bbox.max.x) { + if (y == span->y && x == span->x && x + span->len <= surface->compositor->bbox.max.x) { + if (span->coverage == 255) src = color; + else src = ALPHA_BLEND(color, span->coverage); + auto alpha = ALPHA(src); + for (uint32_t i = 0; i < span->len; ++i) { + cmp[x + i] = ALPHA_BLEND(cmp[x + i], alpha); + } + x += span->len; + ++span; + } else { + cmp[x] = 0; + ++x; + } + } + } +} + + +static bool _rasterMaskedRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + TVGLOG("SW_ENGINE", "Masked(%d) Rle", (int)surface->compositor->method); + + //32bit channels composition + if (surface->channelSize != sizeof(uint32_t)) return false; + + auto method = surface->compositor->method; + + if (method == CompositeMethod::AddMask) _rasterMaskedRleDup(surface, rle, r, g, b, a); + else if (method == CompositeMethod::SubtractMask) _rasterMaskedRleDup(surface, rle, r, g, b, a); + else if (method == CompositeMethod::DifferenceMask) _rasterMaskedRleDup(surface, rle, r, g, b, a); + else if (method == CompositeMethod::IntersectMask) _rasterMaskedRleInt(surface, rle, r, g, b, a); + else return false; + + //Masking Composition + return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterMattedRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + TVGLOG("SW_ENGINE", "Matted(%d) Rle", (int)surface->compositor->method); + + auto span = rle->spans; auto cbuffer = surface->compositor->image.buf8; auto csize = surface->compositor->image.channelSize; + auto alpha = 
surface->blender.alpha(surface->compositor->method); //32bit channels if (surface->channelSize == sizeof(uint32_t)) { + uint32_t src; auto color = surface->blender.join(r, g, b, a); for (uint32_t i = 0; i < rle->size; ++i, ++span) { auto dst = &surface->buf32[span->y * surface->stride + span->x]; @@ -285,24 +527,26 @@ static bool _rasterMaskedRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint if (span->coverage == 255) src = color; else src = ALPHA_BLEND(color, span->coverage); for (uint32_t x = 0; x < span->len; ++x, ++dst, cmp += csize) { - auto tmp = ALPHA_BLEND(src, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + *dst = INTERPOLATE(src, *dst, alpha(cmp)); } } + return true; + } //8bit grayscale - } else if (surface->channelSize == sizeof(uint8_t)) { + if (surface->channelSize == sizeof(uint8_t)) { + uint8_t src; for (uint32_t i = 0; i < rle->size; ++i, ++span) { auto dst = &surface->buf8[span->y * surface->stride + span->x]; auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize]; if (span->coverage == 255) src = a; - else src = _multiply(a, span->coverage); + else src = MULTIPLY(a, span->coverage); for (uint32_t x = 0; x < span->len; ++x, ++dst, cmp += csize) { - auto tmp = _multiply(src, alpha(cmp)); - *dst = tmp + _multiply(*dst, _ialpha(tmp)); + *dst = INTERPOLATE8(src, *dst, alpha(cmp)); } } + return true; } - return true; + return false; } @@ -328,14 +572,7 @@ static bool _rasterSolidRle(SwSurface* surface, const SwRleData* rle, uint8_t r, //8bit grayscale } else if (surface->channelSize == sizeof(uint8_t)) { for (uint32_t i = 0; i < rle->size; ++i, ++span) { - if (span->coverage == 255) { - _rasterGrayscale8(surface->buf8 + span->y * surface->stride, 255, span->x, span->len); - } else { - auto dst = &surface->buf8[span->y * surface->stride + span->x]; - for (uint32_t x = 0; x < span->len; ++x, ++dst) { - *dst = span->coverage; - } - } + _rasterGrayscale8(surface->buf8 + span->y * surface->stride, 
span->coverage, span->x, span->len); } } return true; @@ -347,8 +584,8 @@ static bool _rasterRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, if (!rle) return false; if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - return _rasterMaskedRle(surface, rle, r, g, b, a, alpha); + if (_matting(surface)) return _rasterMattedRle(surface, rle, r, g, b, a); + else return _rasterMaskedRle(surface, rle, r, g, b, a); } else { if (a == 255) { return _rasterSolidRle(surface, rle, r, g, b); @@ -372,156 +609,142 @@ static bool _rasterRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, static bool _transformedRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* transform, uint32_t opacity) { - if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - return _rasterTexmapPolygon(surface, image, transform, nullptr, opacity, alpha); - } else { - return _rasterTexmapPolygon(surface, image, transform, nullptr, opacity, nullptr); + auto ret = _rasterTexmapPolygon(surface, image, transform, nullptr, opacity); + + //Masking Composition + if (_compositing(surface) && _masking(surface)) { + return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox); } - return false; + + return ret; + } + /************************************************************************/ /* RLE Scaled RGBA Image */ /************************************************************************/ -static bool _rasterScaledMaskedTranslucentRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale, SwAlpha alpha) +template +static void _rasterScaledMaskedRleRGBAImageDup(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale) { - TVGLOG("SW_ENGINE", "Scaled Masked Translucent Rle Image"); - + auto scaleMethod = 
image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler; auto span = image->rle->spans; - auto csize = surface->compositor->image.channelSize; - //Center (Down-Scaled) - if (image->scale < DOWN_SCALE_TOLERANCE) { - for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { - auto sy = (uint32_t)(span->y * itransform->e22 + itransform->e23); - if (sy >= image->h) continue; - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - auto cmp = &surface->compositor->image.buf8[(span->y * surface->compositor->image.stride + span->x) * csize]; - auto a = _multiply(span->coverage, opacity); - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) { - auto sx = (uint32_t)(x * itransform->e11 + itransform->e13); - if (sx >= image->w) continue; - auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), a); - auto tmp = ALPHA_BLEND(src, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto sy = span->y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto cmp = &surface->compositor->image.buf32[span->y * surface->compositor->image.stride + span->x]; + auto a = MULTIPLY(span->coverage, opacity); + if (a == 255) { + for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++cmp) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + *cmp = maskOp()(src, *cmp, 255); } - } - //Center (Up-Scaled) - } else { - for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { - auto sy = span->y * itransform->e22 + itransform->e23; - if ((uint32_t)sy >= image->h) continue; - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - auto cmp = &surface->compositor->image.buf8[(span->y * 
surface->compositor->image.stride + span->x) * csize]; - auto a = _multiply(span->coverage, opacity); - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) { + } else { + for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++cmp) { auto sx = x * itransform->e11 + itransform->e13; if ((uint32_t)sx >= image->w) continue; - auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), a); - auto tmp = ALPHA_BLEND(src, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + *cmp = amaskOp()(src, *cmp, a); } } } - return true; } -static bool _rasterScaledMaskedRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t halfScale, SwAlpha alpha) +static void _rasterScaledMaskedRleRGBAImageInt(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale) { - TVGLOG("SW_ENGINE", "Scaled Masked Rle Image"); - + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; auto span = image->rle->spans; - auto csize = surface->compositor->image.channelSize; - - //Center (Down-Scaled) - if (image->scale < DOWN_SCALE_TOLERANCE) { - for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { - auto sy = (uint32_t)(span->y * itransform->e22 + itransform->e23); - if (sy >= image->h) continue; - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - auto cmp = &surface->compositor->image.buf8[(span->y * surface->compositor->image.stride + span->x) * csize]; - if (span->coverage == 255) { - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) { - auto sx = (uint32_t)(x * itransform->e11 + itransform->e13); - if (sx >= image->w) continue; - auto tmp = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } - } else { - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) { - auto sx = (uint32_t)(x * itransform->e11 + itransform->e13); - if (sx >= image->w) continue; - auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), span->coverage); - auto tmp = ALPHA_BLEND(src, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } - } - } - //Center (Up-Scaled) - } else { - for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { - auto sy = span->y * itransform->e22 + itransform->e23; - if ((uint32_t)sy >= image->h) continue; - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - auto cmp = &surface->compositor->image.buf8[(span->y * surface->compositor->image.stride + span->x) * csize]; - if (span->coverage == 255) { - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) { - auto sx = x * itransform->e11 + itransform->e13; - if ((uint32_t)sx >= image->w) continue; - 
auto tmp = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + auto cbuffer = surface->compositor->image.buf32; + auto cstride = surface->compositor->image.stride; + + for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) { + auto cmp = &cbuffer[y * cstride]; + for (uint32_t x = surface->compositor->bbox.min.x; x < surface->compositor->bbox.max.x; ++x) { + if (y == span->y && x == span->x && x + span->len <= surface->compositor->bbox.max.x) { + auto sy = span->y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto alpha = MULTIPLY(span->coverage, opacity); + if (alpha == 255) { + for (uint32_t i = 0; i < span->len; ++i) { + auto sx = (x + i) * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + cmp[x + i] = ALPHA_BLEND(cmp[x + i], ALPHA(src)); + } + } else { + for (uint32_t i = 0; i < span->len; ++i) { + auto sx = (x + i) * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + cmp[x + i] = ALPHA_BLEND(cmp[x + i], ALPHA(ALPHA_BLEND(src, alpha))); + } } + x += span->len - 1; + ++span; } else { - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) { - auto sx = x * itransform->e11 + itransform->e13; - if ((uint32_t)sx >= image->w) continue; - auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), span->coverage); - auto tmp = ALPHA_BLEND(src, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } + cmp[x] = 0; } } } - return true; } -static bool _rasterScaledTranslucentRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, 
uint32_t halfScale) +static bool _rasterScaledMaskedRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale) { + TVGLOG("SW_ENGINE", "Scaled Masked(%d) Rle Image", (int)surface->compositor->method); + + auto method = surface->compositor->method; + + if (method == CompositeMethod::AddMask) _rasterScaledMaskedRleRGBAImageDup(surface, image, itransform, region, opacity, halfScale); + else if (method == CompositeMethod::SubtractMask) _rasterScaledMaskedRleRGBAImageDup(surface, image, itransform, region, opacity, halfScale); + else if (method == CompositeMethod::IntersectMask) _rasterScaledMaskedRleRGBAImageDup(surface, image, itransform, region, opacity, halfScale); + else if (method == CompositeMethod::IntersectMask) _rasterScaledMaskedRleRGBAImageInt(surface, image, itransform, region, opacity, halfScale); + else return false; + + //Masking Composition + return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterScaledMattedRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale) +{ + TVGLOG("SW_ENGINE", "Scaled Matted(%d) Rle Image", (int)surface->compositor->method); + auto span = image->rle->spans; + auto csize = surface->compositor->image.channelSize; + auto alpha = surface->blender.alpha(surface->compositor->method); - //Center (Down-Scaled) - if (image->scale < DOWN_SCALE_TOLERANCE) { - for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { - auto sy = (uint32_t)(span->y * itransform->e22 + itransform->e23); - if (sy >= image->h) continue; - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - auto alpha = _multiply(span->coverage, opacity); - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) { - auto sx = (uint32_t)(x * itransform->e11 + itransform->e13); - if (sx >= image->w) 
continue; - auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), alpha); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto sy = span->y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto cmp = &surface->compositor->image.buf8[(span->y * surface->compositor->image.stride + span->x) * csize]; + auto a = MULTIPLY(span->coverage, opacity); + if (a == 255) { + for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto tmp = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), alpha(cmp)); + *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp)); } - } - //Center (Up-Scaled) - } else { - for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { - auto sy = span->y * itransform->e22 + itransform->e23; - if ((uint32_t)sy >= image->h) continue; - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - auto alpha = _multiply(span->coverage, opacity); - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) { + } else { + for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst, cmp += csize) { auto sx = x * itransform->e11 + itransform->e13; if ((uint32_t)sx >= image->w) continue; - auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), alpha); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + auto tmp = ALPHA_BLEND(src, MULTIPLY(alpha(cmp), a)); + *dst = tmp + ALPHA_BLEND(*dst, 
IALPHA(tmp)); } } } + return true; } @@ -530,48 +753,26 @@ static bool _rasterScaledRleRGBAImage(SwSurface* surface, const SwImage* image, { auto span = image->rle->spans; - //Center (Down-Scaled) - if (image->scale < DOWN_SCALE_TOLERANCE) { - for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { - auto sy = (uint32_t)(span->y * itransform->e22 + itransform->e23); - if (sy >= image->h) continue; - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - if (span->coverage == 255) { - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) { - auto sx = (uint32_t)(x * itransform->e11 + itransform->e13); - if (sx >= image->w) continue; - auto src = _interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); - } - } else { - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) { - auto sx = (uint32_t)(x * itransform->e11 + itransform->e13); - if (sx >= image->w) continue; - auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), span->coverage); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); - } + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto sy = span->y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto alpha = MULTIPLY(span->coverage, opacity); + if (alpha == 255) { + for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + *dst = src + ALPHA_BLEND(*dst, IALPHA(src)); } - } - //Center (Up-Scaled) - } else { - for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { - auto sy = span->y * itransform->e22 + itransform->e23; - if ((uint32_t)sy >= image->h) continue; - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - if (span->coverage == 255) { - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) { - auto sx = x * itransform->e11 + itransform->e13; - if ((uint32_t)sx >= image->w) continue; - auto src = _interpUpScaler(image->buf32, image->w, image->h, sx, sy); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); - } - } else { - for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) { - auto sx = x * itransform->e11 + itransform->e13; - if ((uint32_t)sx >= image->w) continue; - auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), span->coverage); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); - } + } else { + for (uint32_t x = static_cast(span->x); x < static_cast(span->x) + span->len; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), alpha); + *dst = src + ALPHA_BLEND(*dst, IALPHA(src)); } } } @@ -590,12 
+791,10 @@ static bool _scaledRleRGBAImage(SwSurface* surface, const SwImage* image, const auto halfScale = _halfScale(image->scale); if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - if (opacity == 255) return _rasterScaledMaskedRleRGBAImage(surface, image, &itransform, region, halfScale, alpha); - else return _rasterScaledMaskedTranslucentRleRGBAImage(surface, image, &itransform, region, opacity, halfScale, alpha); + if (_matting(surface)) _rasterScaledMattedRleRGBAImage(surface, image, &itransform, region, opacity, halfScale); + else _rasterScaledMaskedRleRGBAImage(surface, image, &itransform, region, opacity, halfScale); } else { - if (opacity == 255) return _rasterScaledRleRGBAImage(surface, image, &itransform, region, opacity, halfScale); - else return _rasterScaledTranslucentRleRGBAImage(surface, image, &itransform, region, opacity, halfScale); + return _rasterScaledRleRGBAImage(surface, image, &itransform, region, opacity, halfScale); } return false; } @@ -605,56 +804,104 @@ static bool _scaledRleRGBAImage(SwSurface* surface, const SwImage* image, const /* RLE Direct RGBA Image */ /************************************************************************/ -static bool _rasterDirectMaskedTranslucentRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity, SwAlpha alpha) +template +static void _rasterDirectMaskedRleRGBAImageDup(SwSurface* surface, const SwImage* image, uint32_t opacity) { - TVGLOG("SW_ENGINE", "Direct Masked Rle Image"); - auto span = image->rle->spans; - auto csize = surface->compositor->image.channelSize; - auto cbuffer = surface->compositor->image.buf8; + auto cbuffer = surface->compositor->image.buf32; + auto ctride = surface->compositor->image.stride; for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize]; - auto img = 
image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox); - auto a = _multiply(span->coverage, opacity); - if (a == 255) { - for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img, cmp += csize) { - auto tmp = ALPHA_BLEND(*img, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + auto src = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox); + auto cmp = &cbuffer[span->y * ctride + span->x]; + auto alpha = MULTIPLY(span->coverage, opacity); + if (alpha == 255) { + for (uint32_t x = 0; x < span->len; ++x, ++src, ++cmp) { + *cmp = maskOp()(*src, *cmp, IALPHA(*src)); } } else { - for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img, cmp += csize) { - auto tmp = ALPHA_BLEND(*img, _multiply(a, alpha(cmp))); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + for (uint32_t x = 0; x < span->len; ++x, ++src, ++cmp) { + *cmp = amaskOp()(*src, *cmp, alpha); } } } - return true; } -static bool _rasterDirectMaskedRleRGBAImage(SwSurface* surface, const SwImage* image, SwAlpha alpha) +static void _rasterDirectMaskedRleRGBAImageInt(SwSurface* surface, const SwImage* image, uint32_t opacity) +{ + auto span = image->rle->spans; + auto cbuffer = surface->compositor->image.buf32; + auto ctride = surface->compositor->image.stride; + + for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) { + auto cmp = &cbuffer[y * ctride]; + auto x = surface->compositor->bbox.min.x; + while (x < surface->compositor->bbox.max.x) { + if (y == span->y && x == span->x && x + span->len <= surface->compositor->bbox.max.x) { + auto alpha = MULTIPLY(span->coverage, opacity); + auto src = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox); + if (alpha == 255) { + for (uint32_t i = 0; i < span->len; ++i, ++src) { + cmp[x + i] = ALPHA_BLEND(cmp[x + i], ALPHA(*src)); + } + } else { + for (uint32_t i = 0; i < span->len; ++i, ++src) { + auto t = ALPHA_BLEND(*src, alpha); + cmp[x + i] = 
ALPHA_BLEND(cmp[x + i], ALPHA(t)); + } + } + x += span->len; + ++span; + } else { + cmp[x] = 0; + ++x; + } + } + } +} + + +static bool _rasterDirectMaskedRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity) +{ + TVGLOG("SW_ENGINE", "Direct Masked(%d) Rle Image", (int)surface->compositor->method); + + auto method = surface->compositor->method; + + if (method == CompositeMethod::AddMask) _rasterDirectMaskedRleRGBAImageDup(surface, image, opacity); + else if (method == CompositeMethod::SubtractMask) _rasterDirectMaskedRleRGBAImageDup(surface, image, opacity); + else if (method == CompositeMethod::DifferenceMask) _rasterDirectMaskedRleRGBAImageDup(surface, image, opacity); + else if (method == CompositeMethod::IntersectMask) _rasterDirectMaskedRleRGBAImageInt(surface, image, opacity); + else return false; + + //Masking Composition + return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterDirectMattedRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity) { - TVGLOG("SW_ENGINE", "Direct Masked Rle Image"); + TVGLOG("SW_ENGINE", "Direct Matted(%d) Rle Image", (int)surface->compositor->method); auto span = image->rle->spans; auto csize = surface->compositor->image.channelSize; auto cbuffer = surface->compositor->image.buf8; + auto alpha = surface->blender.alpha(surface->compositor->method); for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { auto dst = &surface->buf32[span->y * surface->stride + span->x]; auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize]; auto img = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox); - if (span->coverage == 255) { + auto a = MULTIPLY(span->coverage, opacity); + if (a == 255) { for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img, cmp += csize) { auto tmp = ALPHA_BLEND(*img, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + *dst = tmp + ALPHA_BLEND(*dst, 
IALPHA(tmp)); } } else { for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img, cmp += csize) { - auto tmp = ALPHA_BLEND(*img, _multiply(span->coverage, alpha(cmp))); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + auto tmp = ALPHA_BLEND(*img, MULTIPLY(a, alpha(cmp))); + *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp)); } } } @@ -662,57 +909,37 @@ static bool _rasterDirectMaskedRleRGBAImage(SwSurface* surface, const SwImage* i } -static bool _rasterDirectTranslucentRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity) +static bool _rasterDirectRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity) { auto span = image->rle->spans; for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { auto dst = &surface->buf32[span->y * surface->stride + span->x]; auto img = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox); - auto alpha = _multiply(span->coverage, opacity); - for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) { - auto src = ALPHA_BLEND(*img, alpha); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + auto alpha = MULTIPLY(span->coverage, opacity); + if (alpha == 255) { + *dst = *img + ALPHA_BLEND(*dst, IALPHA(*img)); + } else { + for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) { + auto src = ALPHA_BLEND(*img, alpha); + *dst = src + ALPHA_BLEND(*dst, IALPHA(src)); + } } } return true; } -static bool _rasterDirectRleRGBAImage(SwSurface* surface, const SwImage* image) +static bool _directRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity) { - auto span = image->rle->spans; - - for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - auto img = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox); - if (span->coverage == 255) { - for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) { - *dst = *img + ALPHA_BLEND(*dst, _ialpha(*img)); - } - } else { - for (uint32_t x = 0; x < 
span->len; ++x, ++dst, ++img) { - auto src = ALPHA_BLEND(*img, span->coverage); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); - } - } - } - return true; -} - - -static bool _directRleRGBAImage(SwSurface* surface, const SwImage* image, uint32_t opacity) -{ - if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - if (opacity == 255) return _rasterDirectMaskedRleRGBAImage(surface, image, alpha); - else return _rasterDirectMaskedTranslucentRleRGBAImage(surface, image, opacity, alpha); - } else { - if (opacity == 255) return _rasterDirectRleRGBAImage(surface, image); - else return _rasterDirectTranslucentRleRGBAImage(surface, image, opacity); - } - return false; -} + if (_compositing(surface)) { + if (_matting(surface)) return _rasterDirectMattedRleRGBAImage(surface, image, opacity); + else return _rasterDirectMaskedRleRGBAImage(surface, image, opacity); + } else { + return _rasterDirectRleRGBAImage(surface, image, opacity); + } + return false; +} /************************************************************************/ @@ -721,24 +948,21 @@ static bool _directRleRGBAImage(SwSurface* surface, const SwImage* image, uint32 static bool _transformedRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox& region, uint32_t opacity) { - if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - return _rasterTexmapPolygon(surface, image, transform, &region, opacity, alpha); - } else { - return _rasterTexmapPolygon(surface, image, transform, &region, opacity, nullptr); + auto ret = _rasterTexmapPolygon(surface, image, transform, &region, opacity); + + //Masking Composition + if (_compositing(surface) && _masking(surface)) { + return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox); } - return false; + + return ret; } + static bool _transformedRGBAImageMesh(SwSurface* surface, const SwImage* image, const RenderMesh* mesh, const 
Matrix* transform, const SwBBox* region, uint32_t opacity) { - if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - return _rasterTexmapPolygonMesh(surface, image, mesh, transform, region, opacity, alpha); - } else { - return _rasterTexmapPolygonMesh(surface, image, mesh, transform, region, opacity, nullptr); - } - return false; + //TODO: Not completed for all cases. + return _rasterTexmapPolygonMesh(surface, image, mesh, transform, region, opacity); } @@ -746,161 +970,169 @@ static bool _transformedRGBAImageMesh(SwSurface* surface, const SwImage* image, /*Scaled RGBA Image */ /************************************************************************/ -static bool _rasterScaledMaskedTranslucentRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale, SwAlpha alpha) +template +static void _rasterScaledMaskedRGBAImageDup(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale) { - TVGLOG("SW_ENGINE", "Scaled Masked Image"); + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf32 + (region.min.y * cstride + region.min.x); - auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x); - auto csize = surface->compositor->image.channelSize; - auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; - - // Down-Scaled - if (image->scale < DOWN_SCALE_TOLERANCE) { - for (auto y = region.min.y; y < region.max.y; ++y) { - auto sy = (uint32_t)(y * itransform->e22 + itransform->e23); - if (sy >= image->h) continue; - auto dst = dbuffer; - auto cmp = cbuffer; - for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) { - auto sx = (uint32_t)(x * itransform->e11 + itransform->e13); - if (sx >= image->w) continue; - auto a = _multiply(opacity, alpha(cmp)); - auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), a); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + for (auto y = region.min.y; y < region.max.y; ++y) { + auto sy = y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto cmp = cbuffer; + if (opacity == 255) { + for (auto x = region.min.x; x < region.max.x; ++x, ++cmp) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + *cmp = maskOp()(src, *cmp, IALPHA(src)); } - dbuffer += surface->stride; - cbuffer += surface->compositor->image.stride * csize; - } - // Up-Scaled - } else { - for (auto y = region.min.y; y < region.max.y; ++y) { - auto sy = y * itransform->e22 + itransform->e23; - if ((uint32_t)sy >= image->h) continue; - auto dst = dbuffer; - auto cmp = cbuffer; - for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) { + } else { + for (auto x = region.min.x; x < 
region.max.x; ++x, ++cmp) { auto sx = x * itransform->e11 + itransform->e13; if ((uint32_t)sx >= image->w) continue; - auto a = _multiply(opacity, alpha(cmp)); - auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), a); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + *cmp = amaskOp()(src, *cmp, opacity); } - dbuffer += surface->stride; - cbuffer += surface->compositor->image.stride * csize; } + cbuffer += cstride; } - return true; } - -static bool _rasterScaledMaskedRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t halfScale, SwAlpha alpha) +static void _rasterScaledMaskedRGBAImageInt(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale) { - TVGLOG("SW_ENGINE", "Scaled Masked Image"); - - auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x); - auto csize = surface->compositor->image.channelSize; - auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; - - // Down-Scaled - if (image->scale < DOWN_SCALE_TOLERANCE) { - for (auto y = region.min.y; y < region.max.y; ++y) { - auto sy = (uint32_t)(y * itransform->e22 + itransform->e23); - if (sy >= image->h) continue; - auto dst = dbuffer; - auto cmp = cbuffer; - for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) { - auto sx = (uint32_t)(x * itransform->e11 + itransform->e13); - if (sx >= image->w) continue; - auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), alpha(cmp)); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; + auto h = static_cast(region.max.y - region.min.y); + auto w = static_cast(region.max.x - region.min.x); + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf32 + (surface->compositor->bbox.min.y * cstride + surface->compositor->bbox.min.x); + + for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) { + if (y == region.min.y) { + auto cbuffer2 = cbuffer; + for (uint32_t y2 = y; y2 < region.max.y; ++y2) { + auto sy = y2 * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto tmp = cbuffer2; + auto x = surface->compositor->bbox.min.x; + while (x < surface->compositor->bbox.max.x) { + if (x == region.min.x) { + if (opacity == 255) { + for (uint32_t i = 0; i < w; ++i, ++tmp) { + auto sx = (x + i) * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + *tmp = ALPHA_BLEND(*tmp, ALPHA(src)); + } + } else { + for (uint32_t i = 0; i < w; ++i, ++tmp) { + auto sx = (x + i) * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), opacity); + *tmp = ALPHA_BLEND(*tmp, ALPHA(src)); + } + } + x += w; + } else { + *tmp = 0; + ++tmp; + ++x; + } + } + cbuffer2 += cstride; } - dbuffer += surface->stride; - cbuffer += surface->compositor->image.stride * csize; - } - // Up-Scaled - } else { - for (auto y = region.min.y; y < region.max.y; ++y) { - auto sy = y * itransform->e22 + itransform->e23; - if ((uint32_t)sy >= image->h) continue; - auto dst = dbuffer; - auto cmp = cbuffer; - for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) { - auto sx = x * itransform->e11 + itransform->e13; - if ((uint32_t)sx >= image->w) continue; - auto src = 
ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), alpha(cmp)); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + y += (h - 1); + } else { + auto tmp = cbuffer; + for (uint32_t x = surface->compositor->bbox.min.x; x < surface->compositor->bbox.max.x; ++x, ++tmp) { + *tmp = 0; } - dbuffer += surface->stride; - cbuffer += surface->compositor->image.stride * csize; } + cbuffer += cstride; } - return true; } -static bool _rasterScaledTranslucentRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale) +static bool _rasterScaledMaskedRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale) +{ + auto method = surface->compositor->method; + + TVGLOG("SW_ENGINE", "Scaled Masked(%d) Image [Region: %lu %lu %lu %lu]", (int)surface->compositor->method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y); + + if (method == CompositeMethod::AddMask) _rasterScaledMaskedRGBAImageDup(surface, image, itransform, region, opacity, halfScale); + else if (method == CompositeMethod::SubtractMask) _rasterScaledMaskedRGBAImageDup(surface, image, itransform, region, opacity, halfScale); + else if (method == CompositeMethod::DifferenceMask) _rasterScaledMaskedRGBAImageDup(surface, image, itransform, region, opacity, halfScale); + else if (method == CompositeMethod::IntersectMask) _rasterScaledMaskedRGBAImageInt(surface, image, itransform, region, opacity, halfScale); + else return false; + + //Masking Composition + return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterScaledMattedRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale) { auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x); + auto csize = 
surface->compositor->image.channelSize; + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; + auto alpha = surface->blender.alpha(surface->compositor->method); - // Down-Scaled - if (image->scale < DOWN_SCALE_TOLERANCE) { - for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) { - auto sy = (uint32_t)(y * itransform->e22 + itransform->e23); - if (sy >= image->h) continue; - auto dst = dbuffer; - for (auto x = region.min.x; x < region.max.x; ++x, ++dst) { - auto sx = (uint32_t)(x * itransform->e11 + itransform->e13); - if (sx >= image->w) continue; - auto src = ALPHA_BLEND(_interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), opacity); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + TVGLOG("SW_ENGINE", "Scaled Matted(%d) Image [Region: %lu %lu %lu %lu]", (int)surface->compositor->method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y); + + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; + + for (auto y = region.min.y; y < region.max.y; ++y) { + auto sy = y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto dst = dbuffer; + auto cmp = cbuffer; + if (opacity == 255) { + for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + auto temp = ALPHA_BLEND(src, alpha(cmp)); + *dst = temp + ALPHA_BLEND(*dst, IALPHA(temp)); } - } - // Up-Scaled - } else { - for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) { - auto sy = fabsf(y * itransform->e22 + itransform->e23); - if (sy >= image->h) continue; - auto dst = dbuffer; - for (auto x = region.min.x; x < region.max.x; ++x, ++dst) { + } else { + for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) { auto sx = x * itransform->e11 + itransform->e13; if ((uint32_t)sx >= image->w) continue; - auto src = ALPHA_BLEND(_interpUpScaler(image->buf32, image->w, image->h, sx, sy), opacity); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + auto temp = ALPHA_BLEND(src, MULTIPLY(opacity, alpha(cmp))); + *dst = temp + ALPHA_BLEND(*dst, IALPHA(temp)); } } + dbuffer += surface->stride; + cbuffer += surface->compositor->image.stride * csize; } return true; } -static bool _rasterScaledRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t halfScale) +static bool _rasterScaledRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint32_t opacity, uint32_t halfScale) { auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x); + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; - // Down-Scaled - if (image->scale < DOWN_SCALE_TOLERANCE) { - for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) { - auto sy = (uint32_t)(y * itransform->e22 + itransform->e23); - if (sy >= image->h) continue; - auto dst = dbuffer; + for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) { + auto sy = y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto dst = dbuffer; + if (opacity == 255) { for (auto x = region.min.x; x < region.max.x; ++x, ++dst) { - auto sx = (uint32_t)(x * itransform->e11 + itransform->e13); - if (sx >= image->w) continue; - auto src = _interpDownScaler(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale); + *dst = src + ALPHA_BLEND(*dst, IALPHA(src)); } - } - // Up-Scaled - } else { - for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) { - auto sy = y * itransform->e22 + itransform->e23; - if ((uint32_t)sy >= image->h) continue; - auto dst = dbuffer; + } else { for (auto x = region.min.x; x < region.max.x; ++x, ++dst) { auto sx = x * itransform->e11 + itransform->e13; if ((uint32_t)sx >= image->w) continue; - auto src = _interpUpScaler(image->buf32, image->w, image->h, sx, sy); - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + auto src = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, halfScale), opacity); + *dst = src + ALPHA_BLEND(*dst, IALPHA(src)); } } } @@ -919,12 +1151,10 @@ static bool _scaledRGBAImage(SwSurface* surface, const SwImage* image, const Mat auto halfScale = _halfScale(image->scale); if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - if (opacity == 
255) return _rasterScaledMaskedRGBAImage(surface, image, &itransform, region, halfScale, alpha); - else return _rasterScaledMaskedTranslucentRGBAImage(surface, image, &itransform, region, opacity, halfScale, alpha); + if (_matting(surface)) return _rasterScaledMattedRGBAImage(surface, image, &itransform, region, opacity, halfScale); + else return _rasterScaledMaskedRGBAImage(surface, image, &itransform, region, opacity, halfScale); } else { - if (opacity == 255) return _rasterScaledRGBAImage(surface, image, &itransform, region, halfScale); - else return _rasterScaledTranslucentRGBAImage(surface, image, &itransform, region, opacity, halfScale); + return _rasterScaledRGBAImage(surface, image, &itransform, region, opacity, halfScale); } return false; } @@ -934,53 +1164,122 @@ static bool _scaledRGBAImage(SwSurface* surface, const SwImage* image, const Mat /* Direct RGBA Image */ /************************************************************************/ -static bool _rasterDirectMaskedRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, SwAlpha alpha) +template +static void _rasterDirectMaskedRGBAImageDup(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity) { - TVGLOG("SW_ENGINE", "Direct Masked Image"); - - auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; - auto h2 = static_cast(region.max.y - region.min.y); - auto w2 = static_cast(region.max.x - region.min.x); - auto csize = surface->compositor->image.channelSize; + auto h = static_cast(region.max.y - region.min.y); + auto w = static_cast(region.max.x - region.min.x); + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf32 + (region.min.y * cstride + region.min.x); //compositor buffer auto sbuffer = image->buf32 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox); - auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + 
region.min.x) * csize; //compositor buffer - for (uint32_t y = 0; y < h2; ++y) { - auto dst = buffer; + for (uint32_t y = 0; y < h; ++y) { auto cmp = cbuffer; auto src = sbuffer; - for (uint32_t x = 0; x < w2; ++x, ++dst, ++src, cmp += csize) { - auto tmp = ALPHA_BLEND(*src, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + if (opacity == 255) { + for (uint32_t x = 0; x < w; ++x, ++src, ++cmp) { + *cmp = maskOp()(*src, *cmp, IALPHA(*src)); + } + } else { + for (uint32_t x = 0; x < w; ++x, ++src, ++cmp) { + *cmp = amaskOp()(*src, *cmp, opacity); + } } - buffer += surface->stride; - cbuffer += surface->compositor->image.stride * csize; + cbuffer += cstride; sbuffer += image->stride; } - return true; } -static bool _rasterDirectMaskedTranslucentRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity, SwAlpha alpha) +static void _rasterDirectMaskedRGBAImageInt(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity) { - TVGLOG("SW_ENGINE", "Direct Masked Translucent Image"); + auto h = static_cast(region.max.y - region.min.y); + auto w = static_cast(region.max.x - region.min.x); + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf32 + (surface->compositor->bbox.min.y * cstride + surface->compositor->bbox.min.x); + + for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) { + if (y == region.min.y) { + auto cbuffer2 = cbuffer; + for (uint32_t y2 = y; y2 < region.max.y; ++y2) { + auto tmp = cbuffer2; + auto x = surface->compositor->bbox.min.x; + while (x < surface->compositor->bbox.max.x) { + if (x == region.min.x) { + auto src = &image->buf32[(y2 + image->oy) * image->stride + (x + image->ox)]; + if (opacity == 255) { + for (uint32_t i = 0; i < w; ++i, ++tmp, ++src) { + *tmp = ALPHA_BLEND(*tmp, ALPHA(*src)); + } + } else { + for (uint32_t i = 0; i < w; ++i, ++tmp, ++src) { + auto t = ALPHA_BLEND(*src, opacity); + 
*tmp = ALPHA_BLEND(*tmp, ALPHA(t)); + } + } + x += w; + } else { + *tmp = 0; + ++tmp; + ++x; + } + } + cbuffer2 += cstride; + } + y += (h - 1); + } else { + rasterRGBA32(cbuffer, 0x00000000, 0, surface->compositor->bbox.max.x - surface->compositor->bbox.min.x); + } + cbuffer += cstride; + } +} + +static bool _rasterDirectMaskedRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity) +{ + auto method = surface->compositor->method; + + TVGLOG("SW_ENGINE", "Direct Masked(%d) Image [Region: %lu %lu %lu %lu]", (int)surface->compositor->method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y); + + if (method == CompositeMethod::AddMask) _rasterDirectMaskedRGBAImageDup(surface, image, region, opacity); + else if (method == CompositeMethod::SubtractMask) _rasterDirectMaskedRGBAImageDup(surface, image, region, opacity); + else if (method == CompositeMethod::DifferenceMask) _rasterDirectMaskedRGBAImageDup(surface, image, region, opacity); + else if (method == CompositeMethod::IntersectMask) _rasterDirectMaskedRGBAImageInt(surface, image, region, opacity); + else return false; + + //Masking Composition + return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterDirectMattedRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity) +{ auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; - auto h2 = static_cast(region.max.y - region.min.y); - auto w2 = static_cast(region.max.x - region.min.x); + auto h = static_cast(region.max.y - region.min.y); + auto w = static_cast(region.max.x - region.min.x); auto csize = surface->compositor->image.channelSize; + auto alpha = surface->blender.alpha(surface->compositor->method); + + TVGLOG("SW_ENGINE", "Direct Matted(%d) Image [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h); auto sbuffer = image->buf32 
+ (region.min.y + image->oy) * image->stride + (region.min.x + image->ox); auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; //compositor buffer - for (uint32_t y = 0; y < h2; ++y) { + for (uint32_t y = 0; y < h; ++y) { auto dst = buffer; auto cmp = cbuffer; auto src = sbuffer; - for (uint32_t x = 0; x < w2; ++x, ++dst, ++src, cmp += csize) { - auto tmp = ALPHA_BLEND(*src, _multiply(opacity, alpha(cmp))); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + if (opacity == 255) { + for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) { + auto tmp = ALPHA_BLEND(*src, alpha(cmp)); + *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp)); + } + } else { + for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) { + auto tmp = ALPHA_BLEND(*src, MULTIPLY(opacity, alpha(cmp))); + *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp)); + } } buffer += surface->stride; cbuffer += surface->compositor->image.stride * csize; @@ -990,7 +1289,7 @@ static bool _rasterDirectMaskedTranslucentRGBAImage(SwSurface* surface, const Sw } -static bool _rasterDirectTranslucentRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity) +static bool _rasterDirectRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity) { auto dbuffer = &surface->buf32[region.min.y * surface->stride + region.min.x]; auto sbuffer = image->buf32 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox); @@ -998,27 +1297,15 @@ static bool _rasterDirectTranslucentRGBAImage(SwSurface* surface, const SwImage* for (auto y = region.min.y; y < region.max.y; ++y) { auto dst = dbuffer; auto src = sbuffer; - for (auto x = region.min.x; x < region.max.x; ++x, ++dst, ++src) { - auto tmp = ALPHA_BLEND(*src, opacity); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } - dbuffer += surface->stride; - sbuffer += image->stride; - } - return true; -} - - -static bool 
_rasterDirectRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region) -{ - auto dbuffer = &surface->buf32[region.min.y * surface->stride + region.min.x]; - auto sbuffer = image->buf32 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox); - - for (auto y = region.min.y; y < region.max.y; ++y) { - auto dst = dbuffer; - auto src = sbuffer; - for (auto x = region.min.x; x < region.max.x; x++, dst++, src++) { - *dst = *src + ALPHA_BLEND(*dst, _ialpha(*src)); + if (opacity == 255) { + for (auto x = region.min.x; x < region.max.x; x++, dst++, src++) { + *dst = *src + ALPHA_BLEND(*dst, IALPHA(*src)); + } + } else { + for (auto x = region.min.x; x < region.max.x; ++x, ++dst, ++src) { + auto tmp = ALPHA_BLEND(*src, opacity); + *dst = tmp + ALPHA_BLEND(*dst, IALPHA(tmp)); + } } dbuffer += surface->stride; sbuffer += image->stride; @@ -1031,12 +1318,10 @@ static bool _rasterDirectRGBAImage(SwSurface* surface, const SwImage* image, con static bool _directRGBAImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint32_t opacity) { if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - if (opacity == 255) return _rasterDirectMaskedRGBAImage(surface, image, region, alpha); - else return _rasterDirectMaskedTranslucentRGBAImage(surface, image, region, opacity, alpha); + if (_matting(surface)) return _rasterDirectMattedRGBAImage(surface, image, region, opacity); + else return _rasterDirectMaskedRGBAImage(surface, image, region, opacity); } else { - if (opacity == 255) return _rasterDirectRGBAImage(surface, image, region); - else return _rasterDirectTranslucentRGBAImage(surface, image, region, opacity); + return _rasterDirectRGBAImage(surface, image, region, opacity); } return false; } @@ -1060,31 +1345,82 @@ static bool _rasterRGBAImage(SwSurface* surface, SwImage* image, const Matrix* t /************************************************************************/ -/* Rect Linear Gradient 
*/ +/* Rect Gradient */ /************************************************************************/ -static bool _rasterLinearGradientMaskedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill, SwAlpha alpha) +template +static bool _rasterGradientMaskedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) { - if (fill->linear.len < FLT_EPSILON) return false; + auto h = static_cast(region.max.y - region.min.y); + auto w = static_cast(region.max.x - region.min.x); + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf32 + (region.min.y * cstride + region.min.x); + auto method = surface->compositor->method; + + TVGLOG("SW_ENGINE", "Masked(%d) Gradient [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h); + + if (method == CompositeMethod::AddMask) { + for (uint32_t y = 0; y < h; ++y) { + fillMethod()(fill, cbuffer, region.min.y + y, region.min.x, w, opAddMask, 255); + cbuffer += surface->stride; + } + } else if (method == CompositeMethod::SubtractMask) { + for (uint32_t y = 0; y < h; ++y) { + fillMethod()(fill, cbuffer, region.min.y + y, region.min.x, w, opSubMask, 255); + cbuffer += surface->stride; + } + } else if (method == CompositeMethod::IntersectMask) { + for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) { + auto cmp = surface->compositor->image.buf32 + (y * cstride + surface->compositor->bbox.min.x); + if (y == region.min.y) { + for (uint32_t y2 = y; y2 < region.max.y; ++y2) { + auto tmp = cmp; + auto x = surface->compositor->bbox.min.x; + while (x < surface->compositor->bbox.max.x) { + if (x == region.min.x) { + fillMethod()(fill, tmp, y2, x, w, opIntMask, 255); + x += w; + tmp += w; + } else { + *tmp = 0; + ++tmp; + ++x; + } + } + cmp += cstride; + } + y += (h - 1); + } else { + rasterRGBA32(cmp, 0x00000000, 0, surface->compositor->bbox.max.x -surface->compositor->bbox.min.x); + cmp += cstride; + } + } + } 
else if (method == CompositeMethod::DifferenceMask) { + for (uint32_t y = 0; y < h; ++y) { + fillMethod()(fill, cbuffer, region.min.y + y, region.min.x, w, opDifMask, 255); + cbuffer += surface->stride; + } + } else return false; + + //Masking Composition + return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox, 255); +} + +template +static bool _rasterGradientMattedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) +{ auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; auto h = static_cast(region.max.y - region.min.y); auto w = static_cast(region.max.x - region.min.x); auto csize = surface->compositor->image.channelSize; auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; + auto alpha = surface->blender.alpha(surface->compositor->method); - auto sbuffer = static_cast(alloca(w * sizeof(uint32_t))); - if (!sbuffer) return false; + TVGLOG("SW_ENGINE", "Matted(%d) Gradient [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h); for (uint32_t y = 0; y < h; ++y) { - fillFetchLinear(fill, sbuffer, region.min.y + y, region.min.x, w); - auto dst = buffer; - auto cmp = cbuffer; - auto src = sbuffer; - for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) { - auto tmp = ALPHA_BLEND(*src, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } + fillMethod()(fill, buffer, region.min.y + y, region.min.x, w, cbuffer, alpha, csize, 255); buffer += surface->stride; cbuffer += surface->stride * csize; } @@ -1092,39 +1428,30 @@ static bool _rasterLinearGradientMaskedRect(SwSurface* surface, const SwBBox& re } -static bool _rasterTranslucentLinearGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) +template +static bool _rasterTranslucentGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) { - if (fill->linear.len < FLT_EPSILON) 
return false; - auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; auto h = static_cast(region.max.y - region.min.y); auto w = static_cast(region.max.x - region.min.x); - auto sbuffer = static_cast(alloca(w * sizeof(uint32_t))); - if (!sbuffer) return false; - for (uint32_t y = 0; y < h; ++y) { - auto dst = buffer; - fillFetchLinear(fill, sbuffer, region.min.y + y, region.min.x, w); - for (uint32_t x = 0; x < w; ++x, ++dst) { - *dst = sbuffer[x] + ALPHA_BLEND(*dst, _ialpha(sbuffer[x])); - } + fillMethod()(fill, buffer, region.min.y + y, region.min.x, w, opBlend, 255); buffer += surface->stride; } return true; } -static bool _rasterSolidLinearGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) +template +static bool _rasterSolidGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) { - if (fill->linear.len < FLT_EPSILON) return false; - auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; auto w = static_cast(region.max.x - region.min.x); auto h = static_cast(region.max.y - region.min.y); for (uint32_t y = 0; y < h; ++y) { - fillFetchLinear(fill, buffer + y * surface->stride, region.min.y + y, region.min.x, w); + fillMethod()(fill, buffer + y * surface->stride, region.min.y + y, region.min.x, w); } return true; } @@ -1132,300 +1459,158 @@ static bool _rasterSolidLinearGradientRect(SwSurface* surface, const SwBBox& reg static bool _rasterLinearGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) { + if (fill->linear.len < FLT_EPSILON) return false; + if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - return _rasterLinearGradientMaskedRect(surface, region, fill, alpha); + if (_matting(surface)) return _rasterGradientMattedRect(surface, region, fill); + else return _rasterGradientMaskedRect(surface, region, fill); } else { - if (fill->translucent) return _rasterTranslucentLinearGradientRect(surface, 
region, fill); - else _rasterSolidLinearGradientRect(surface, region, fill); + if (fill->translucent) return _rasterTranslucentGradientRect(surface, region, fill); + else _rasterSolidGradientRect(surface, region, fill); } return false; } -/************************************************************************/ -/* Rle Linear Gradient */ -/************************************************************************/ - -static bool _rasterLinearGradientMaskedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill, SwAlpha alpha) -{ - if (fill->linear.len < FLT_EPSILON) return false; - - auto span = rle->spans; - auto csize = surface->compositor->image.channelSize; - auto cbuffer = surface->compositor->image.buf8; - auto buffer = static_cast(alloca(surface->w * sizeof(uint32_t))); - if (!buffer) return false; - - for (uint32_t i = 0; i < rle->size; ++i, ++span) { - fillFetchLinear(fill, buffer, span->y, span->x, span->len); - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize]; - auto src = buffer; - if (span->coverage == 255) { - for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) { - auto tmp = ALPHA_BLEND(*src, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } - } else { - auto ialpha = 255 - span->coverage; - for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) { - auto tmp = ALPHA_BLEND(*src, alpha(cmp)); - tmp = ALPHA_BLEND(tmp, span->coverage) + ALPHA_BLEND(*dst, ialpha); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } - } - } - return true; -} - - -static bool _rasterTranslucentLinearGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) -{ - if (fill->linear.len < FLT_EPSILON) return false; - - auto span = rle->spans; - auto buffer = static_cast(alloca(surface->w * sizeof(uint32_t))); - if (!buffer) return false; - - for (uint32_t i = 0; i < rle->size; ++i, ++span) { - auto dst = 
&surface->buf32[span->y * surface->stride + span->x]; - fillFetchLinear(fill, buffer, span->y, span->x, span->len); - if (span->coverage == 255) { - for (uint32_t x = 0; x < span->len; ++x, ++dst) { - *dst = buffer[x] + ALPHA_BLEND(*dst, _ialpha(buffer[x])); - } - } else { - for (uint32_t x = 0; x < span->len; ++x, ++dst) { - auto tmp = ALPHA_BLEND(buffer[x], span->coverage); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } - } - } - return true; -} - - -static bool _rasterSolidLinearGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) -{ - if (fill->linear.len < FLT_EPSILON) return false; - - auto buf = static_cast(alloca(surface->w * sizeof(uint32_t))); - if (!buf) return false; - - auto span = rle->spans; - - for (uint32_t i = 0; i < rle->size; ++i, ++span) { - if (span->coverage == 255) { - fillFetchLinear(fill, surface->buf32 + span->y * surface->stride + span->x, span->y, span->x, span->len); - } else { - fillFetchLinear(fill, buf, span->y, span->x, span->len); - auto dst = &surface->buf32[span->y * surface->stride + span->x]; - for (uint32_t x = 0; x < span->len; ++x) { - dst[x] = INTERPOLATE(span->coverage, buf[x], dst[x]); - } - } - } - return true; -} - - -static bool _rasterLinearGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +static bool _rasterRadialGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) { - if (!rle) return false; + if (fill->radial.a < FLT_EPSILON) return false; if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - return _rasterLinearGradientMaskedRle(surface, rle, fill, alpha); + if (_matting(surface)) return _rasterGradientMattedRect(surface, region, fill); + else return _rasterGradientMaskedRect(surface, region, fill); } else { - if (fill->translucent) return _rasterTranslucentLinearGradientRle(surface, rle, fill); - else return _rasterSolidLinearGradientRle(surface, rle, fill); + if (fill->translucent) return 
_rasterTranslucentGradientRect(surface, region, fill); + else _rasterSolidGradientRect(surface, region, fill); } return false; } + /************************************************************************/ -/* Rect Radial Gradient */ +/* Rle Gradient */ /************************************************************************/ -static bool _rasterRadialGradientMaskedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill, SwAlpha alpha) +template +static bool _rasterGradientMaskedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) { - if (fill->radial.a < FLT_EPSILON) return false; - - auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; - auto h = static_cast(region.max.y - region.min.y); - auto w = static_cast(region.max.x - region.min.x); - auto csize = surface->compositor->image.channelSize; - auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; + TVGLOG("SW_ENGINE", "Masked(%d) Rle Linear Gradient", (int)surface->compositor->method); - auto sbuffer = static_cast(alloca(w * sizeof(uint32_t))); - if (!sbuffer) return false; + auto span = rle->spans; + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf32; + auto method = surface->compositor->method; - for (uint32_t y = 0; y < h; ++y) { - fillFetchRadial(fill, sbuffer, region.min.y + y, region.min.x, w); - auto dst = buffer; - auto cmp = cbuffer; - auto src = sbuffer; - for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) { - auto tmp = ALPHA_BLEND(*src, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); + if (method == CompositeMethod::AddMask) { + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto cmp = &cbuffer[span->y * cstride + span->x]; + fillMethod()(fill, cmp, span->y, span->x, span->len, opAddMask, span->coverage); } - buffer += surface->stride; - cbuffer += surface->stride * csize; - } - return true; -} - - 
-static bool _rasterTranslucentRadialGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) -{ - if (fill->radial.a < FLT_EPSILON) return false; - - auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; - auto h = static_cast(region.max.y - region.min.y); - auto w = static_cast(region.max.x - region.min.x); - - auto sbuffer = static_cast(alloca(w * sizeof(uint32_t))); - if (!sbuffer) return false; - - for (uint32_t y = 0; y < h; ++y) { - auto dst = buffer; - fillFetchRadial(fill, sbuffer, region.min.y + y, region.min.x, w); - for (uint32_t x = 0; x < w; ++x, ++dst) { - *dst = sbuffer[x] + ALPHA_BLEND(*dst, _ialpha(sbuffer[x])); + } else if (method == CompositeMethod::SubtractMask) { + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto cmp = &cbuffer[span->y * cstride + span->x]; + fillMethod()(fill, cmp, span->y, span->x, span->len, opSubMask, span->coverage); } - buffer += surface->stride; - } - return true; -} - - -static bool _rasterSolidRadialGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) -{ - if (fill->radial.a < FLT_EPSILON) return false; - - auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; - auto h = static_cast(region.max.y - region.min.y); - auto w = static_cast(region.max.x - region.min.x); - - for (uint32_t y = 0; y < h; ++y) { - auto dst = &buffer[y * surface->stride]; - fillFetchRadial(fill, dst, region.min.y + y, region.min.x, w); - } - return true; -} - + } else if (method == CompositeMethod::IntersectMask) { + for (uint32_t y = surface->compositor->bbox.min.y; y < surface->compositor->bbox.max.y; ++y) { + auto cmp = &cbuffer[y * cstride]; + uint32_t x = surface->compositor->bbox.min.x; + while (x < surface->compositor->bbox.max.x) { + if (y == span->y && x == span->x && x + span->len <= surface->compositor->bbox.max.x) { + fillMethod()(fill, cmp, span->y, span->x, span->len, opIntMask, span->coverage); + x += span->len; + ++span; + } else 
{ + cmp[x] = 0; + ++x; + } + } + } + } else if (method == CompositeMethod::DifferenceMask) { + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto cmp = &cbuffer[span->y * cstride + span->x]; + fillMethod()(fill, cmp, span->y, span->x, span->len, opDifMask, span->coverage); + } + } else return false; -static bool _rasterRadialGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) -{ - if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - return _rasterRadialGradientMaskedRect(surface, region, fill, alpha); - } else { - if (fill->translucent) return _rasterTranslucentRadialGradientRect(surface, region, fill); - else return _rasterSolidRadialGradientRect(surface, region, fill); - } - return false; + //Masking Composition + return _rasterDirectRGBAImage(surface, &surface->compositor->image, surface->compositor->bbox, 255); } -/************************************************************************/ -/* RLE Radial Gradient */ -/************************************************************************/ - -static bool _rasterRadialGradientMaskedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill, SwAlpha alpha) +template +static bool _rasterGradientMattedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) { - if (fill->radial.a < FLT_EPSILON) return false; + TVGLOG("SW_ENGINE", "Matted(%d) Rle Linear Gradient", (int)surface->compositor->method); auto span = rle->spans; auto csize = surface->compositor->image.channelSize; auto cbuffer = surface->compositor->image.buf8; - auto buffer = static_cast(alloca(surface->w * sizeof(uint32_t))); - if (!buffer) return false; + auto alpha = surface->blender.alpha(surface->compositor->method); for (uint32_t i = 0; i < rle->size; ++i, ++span) { - fillFetchRadial(fill, buffer, span->y, span->x, span->len); auto dst = &surface->buf32[span->y * surface->stride + span->x]; auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + 
span->x) * csize]; - auto src = buffer; - if (span->coverage == 255) { - for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) { - auto tmp = ALPHA_BLEND(*src, alpha(cmp)); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } - } else { - for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) { - auto tmp = INTERPOLATE(span->coverage, ALPHA_BLEND(*src, alpha(cmp)), *dst); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } - } + fillMethod()(fill, dst, span->y, span->x, span->len, cmp, alpha, csize, span->coverage); } return true; } -static bool _rasterTranslucentRadialGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +template +static bool _rasterTranslucentGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) { - if (fill->radial.a < FLT_EPSILON) return false; - auto span = rle->spans; - auto buffer = static_cast(alloca(surface->w * sizeof(uint32_t))); - if (!buffer) return false; for (uint32_t i = 0; i < rle->size; ++i, ++span) { auto dst = &surface->buf32[span->y * surface->stride + span->x]; - fillFetchRadial(fill, buffer, span->y, span->x, span->len); - if (span->coverage == 255) { - for (uint32_t x = 0; x < span->len; ++x, ++dst) { - *dst = buffer[x] + ALPHA_BLEND(*dst, _ialpha(buffer[x])); - } - } else { - for (uint32_t x = 0; x < span->len; ++x, ++dst) { - auto tmp = ALPHA_BLEND(buffer[x], span->coverage); - *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp)); - } - } + if (span->coverage == 255) fillMethod()(fill, dst, span->y, span->x, span->len, opBlend, 255); + else fillMethod()(fill, dst, span->y, span->x, span->len, opAlphaBlend, span->coverage); } return true; } -static bool _rasterSolidRadialGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +template +static bool _rasterSolidGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) { - if (fill->radial.a < FLT_EPSILON) return false; - - auto buf = static_cast(alloca(surface->w * 
sizeof(uint32_t))); - if (!buf) return false; - auto span = rle->spans; for (uint32_t i = 0; i < rle->size; ++i, ++span) { auto dst = &surface->buf32[span->y * surface->stride + span->x]; - if (span->coverage == 255) { - fillFetchRadial(fill, dst, span->y, span->x, span->len); - } else { - fillFetchRadial(fill, buf, span->y, span->x, span->len); - auto ialpha = 255 - span->coverage; - for (uint32_t x = 0; x < span->len; ++x, ++dst) { - *dst = ALPHA_BLEND(buf[x], span->coverage) + ALPHA_BLEND(*dst, ialpha); - } - } + if (span->coverage == 255) fillMethod()(fill, dst, span->y, span->x, span->len); + else fillMethod()(fill, dst, span->y, span->x, span->len, opInterpolate, span->coverage); } return true; } +static bool _rasterLinearGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +{ + if (!rle || fill->linear.len < FLT_EPSILON) return false; + + if (_compositing(surface)) { + if (_matting(surface)) return _rasterGradientMattedRle(surface, rle, fill); + else return _rasterGradientMaskedRle(surface, rle, fill); + } else { + if (fill->translucent) return _rasterTranslucentGradientRle(surface, rle, fill); + else return _rasterSolidGradientRle(surface, rle, fill); + } + return false; +} + + static bool _rasterRadialGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) { - if (!rle) return false; + if (!rle || fill->radial.a < FLT_EPSILON) return false; if (_compositing(surface)) { - auto alpha = surface->blender.alpha(surface->compositor->method); - return _rasterRadialGradientMaskedRle(surface, rle, fill, alpha); + if (_matting(surface)) return _rasterGradientMattedRle(surface, rle, fill); + else return _rasterGradientMaskedRle(surface, rle, fill); } else { - if (fill->translucent) _rasterTranslucentRadialGradientRle(surface, rle, fill); - else return _rasterSolidRadialGradientRle(surface, rle, fill); + if (fill->translucent) _rasterTranslucentGradientRle(surface, rle, fill); + else return _rasterSolidGradientRle(surface, rle, 
fill); } return false; } @@ -1449,8 +1634,8 @@ void rasterRGBA32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len) bool rasterCompositor(SwSurface* surface) { //See CompositeMethod, Alpha:3, InvAlpha:4, Luma:5, InvLuma:6 - surface->blender.alphas[0] = _alpha; - surface->blender.alphas[1] = _ialpha; + surface->blender.alphas[0] = ALPHA; + surface->blender.alphas[1] = IALPHA; if (surface->cs == ColorSpace::ABGR8888 || surface->cs == ColorSpace::ABGR8888S) { surface->blender.join = _abgrJoin; @@ -1472,20 +1657,24 @@ bool rasterClear(SwSurface* surface, uint32_t x, uint32_t y, uint32_t w, uint32_ { if (!surface || !surface->buf32 || surface->stride == 0 || surface->w == 0 || surface->h == 0) return false; - //full clear + //32 bits if (surface->channelSize == sizeof(uint32_t)) { + //full clear if (w == surface->stride) { rasterRGBA32(surface->buf32 + (surface->stride * y), 0x00000000, 0, w * h); + //partial clear } else { auto buffer = surface->buf32 + (surface->stride * y + x); for (uint32_t i = 0; i < h; i++) { rasterRGBA32(buffer + (surface->stride * i), 0x00000000, 0, w); } } - //partial clear + //8 bits } else if (surface->channelSize == sizeof(uint8_t)) { + //full clear if (w == surface->stride) { _rasterGrayscale8(surface->buf8 + (surface->stride * y), 0x00, 0, w * h); + //partial clear } else { auto buffer = surface->buf8 + (surface->stride * y + x); for (uint32_t i = 0; i < h; i++) { @@ -1586,9 +1775,9 @@ bool rasterGradientStroke(SwSurface* surface, SwShape* shape, unsigned id) bool rasterShape(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8_t b, uint8_t a) { if (a < 255) { - r = _multiply(r, a); - g = _multiply(g, a); - b = _multiply(b, a); + r = MULTIPLY(r, a); + g = MULTIPLY(g, a); + b = MULTIPLY(b, a); } if (shape->fastTrack) return _rasterRect(surface, shape->bbox, r, g, b, a); @@ -1599,9 +1788,9 @@ bool rasterShape(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8 bool rasterStroke(SwSurface* surface, SwShape* 
shape, uint8_t r, uint8_t g, uint8_t b, uint8_t a) { if (a < 255) { - r = _multiply(r, a); - g = _multiply(g, a); - b = _multiply(b, a); + r = MULTIPLY(r, a); + g = MULTIPLY(g, a); + b = MULTIPLY(b, a); } return _rasterRle(surface, shape->strokeRle, r, g, b, a); diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterAvx.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterAvx.h index 59a83ab06..a73768055 100644 --- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterAvx.h +++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterAvx.h @@ -148,7 +148,7 @@ static bool avxRasterTranslucentRle(SwSurface* surface, const SwRleData* rle, ui if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage); else src = color; - auto ialpha = _ialpha(src); + auto ialpha = IALPHA(src); //1. fill the not aligned memory (for 128-bit registers a 16-bytes alignment is required) auto notAligned = ((uintptr_t)dst & 0xf) / 4; diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterC.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterC.h index 18a096648..a040269ff 100644 --- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterC.h +++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterC.h @@ -40,8 +40,9 @@ static bool inline cRasterTranslucentRle(SwSurface* surface, const SwRleData* rl auto dst = &surface->buf32[span->y * surface->stride + span->x]; if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage); else src = color; + auto ialpha = IALPHA(src); for (uint32_t x = 0; x < span->len; ++x, ++dst) { - *dst = src + ALPHA_BLEND(*dst, _ialpha(src)); + *dst = src + ALPHA_BLEND(*dst, ialpha); } } //8bit grayscale @@ -49,10 +50,11 @@ static bool inline cRasterTranslucentRle(SwSurface* surface, const SwRleData* rl uint8_t src; for (uint32_t i = 0; i < rle->size; ++i, ++span) { auto dst = &surface->buf8[span->y * surface->stride + span->x]; - if (span->coverage < 255) src = 
_multiply(span->coverage, a); + if (span->coverage < 255) src = MULTIPLY(span->coverage, a); else src = a; + auto ialpha = ~a; for (uint32_t x = 0; x < span->len; ++x, ++dst) { - *dst = src + _multiply(*dst, ~src); + *dst = src + MULTIPLY(*dst, ialpha); } } } @@ -69,7 +71,7 @@ static bool inline cRasterTranslucentRect(SwSurface* surface, const SwBBox& regi if (surface->channelSize == sizeof(uint32_t)) { auto color = surface->blender.join(r, g, b, a); auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; - auto ialpha = _ialpha(color); + auto ialpha = IALPHA(color); for (uint32_t y = 0; y < h; ++y) { auto dst = &buffer[y * surface->stride]; for (uint32_t x = 0; x < w; ++x, ++dst) { @@ -79,10 +81,11 @@ static bool inline cRasterTranslucentRect(SwSurface* surface, const SwBBox& regi //8bit grayscale } else if (surface->channelSize == sizeof(uint8_t)) { auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x; + auto ialpha = ~a; for (uint32_t y = 0; y < h; ++y) { auto dst = &buffer[y * surface->stride]; for (uint32_t x = 0; x < w; ++x, ++dst) { - *dst = a + _multiply(*dst, ~a); + *dst = a + MULTIPLY(*dst, ialpha); } } } diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterMaskedTexmapInternal.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterMaskedTexmapInternal.h new file mode 100644 index 000000000..0183b63cc --- /dev/null +++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterMaskedTexmapInternal.h @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2023 the ThorVG project. All rights reserved. 
+ + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef TEXMAP_INT_MASK +{ + float _dudx = dudx, _dvdx = dvdx; + float _dxdya = dxdya, _dxdyb = dxdyb, _dudya = dudya, _dvdya = dvdya; + float _xa = xa, _xb = xb, _ua = ua, _va = va; + auto sbuf = image->buf32; + int32_t sw = static_cast(image->stride); + int32_t sh = image->h; + int32_t x1, x2, ar, ab, iru, irv, px, ay; + int32_t vv = 0, uu = 0; + int32_t minx = INT32_MAX, maxx = INT32_MIN; + float dx, u, v, iptr; + auto cbuffer = surface->compositor->image.buf32; + SwSpan* span = nullptr; //used only when rle based. + + if (!_arrange(image, region, yStart, yEnd)) return; + + //Clear out of the Polygon vertical ranges + auto size = surface->compositor->bbox.max.x - surface->compositor->bbox.min.x; + if (dirFlag == 1) { //left top case. 
+ for(int y = surface->compositor->bbox.min.y; y < yStart; ++y) { + rasterRGBA32(surface->compositor->image.buf32 + y * surface->compositor->image.stride, 0, surface->compositor->bbox.min.x, size); + } + } + if (dirFlag == 4) { //right bottom case. + for(int y = yEnd; y < surface->compositor->bbox.max.y; ++y) { + rasterRGBA32(surface->compositor->image.buf32 + y * surface->compositor->image.stride, 0, surface->compositor->bbox.min.x, size); + } + } + + //Loop through all lines in the segment + uint32_t spanIdx = 0; + + if (region) { + minx = region->min.x; + maxx = region->max.x; + } else { + span = image->rle->spans; + while (span->y < yStart) { + ++span; + ++spanIdx; + } + } + + for (int32_t y = yStart; y < yEnd; ++y) { + auto cmp = &cbuffer[y * surface->compositor->image.stride]; + x1 = (int32_t)_xa; + x2 = (int32_t)_xb; + + if (!region) { + minx = INT32_MAX; + maxx = INT32_MIN; + //one single row, could be consisted of multiple spans. + while (span->y == y && spanIdx < image->rle->size) { + if (minx > span->x) minx = span->x; + if (maxx < span->x + span->len) maxx = span->x + span->len; + ++span; + ++spanIdx; + } + } + + if (x1 < minx) x1 = minx; + if (x2 > maxx) x2 = maxx; + + //Anti-Aliasing frames + //FIXME: this aa must be applied before masking op + ay = y - aaSpans->yStart; + if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1; + if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2; + + //Range allowed + if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) { + for (int32_t x = surface->compositor->bbox.min.x; x < surface->compositor->bbox.max.x; ++x) { + //Range allowed + if (x >= x1 && x < x2) { + //Perform subtexel pre-stepping on UV + dx = 1 - (_xa - x1); + u = _ua + dx * _dudx; + v = _va + dx * _dvdx; + if ((uint32_t)v >= image->h) { + cmp[x] = 0; + } else { + if (opacity == 255) { + uu = (int) u; + vv = (int) v; + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + 
if (vv >= sh) continue; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } + cmp[x] = ALPHA_BLEND(cmp[x], ALPHA(px)); + + //Step UV horizontally + u += _dudx; + v += _dvdx; + } else { + uu = (int) u; + vv = (int) v; + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + if (vv >= sh) continue; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } + cmp[x] = ALPHA_BLEND(cmp[x], MULTIPLY(ALPHA(px), opacity)); + + //Step UV horizontally + u += _dudx; + v += _dvdx; + } + } + } else { + //Clear out of polygon horizontal range + if (x < x1 && (dirFlag == 1 || dirFlag == 2)) cmp[x] = 0; + else if (x >= x2 && (dirFlag == 3 || dirFlag == 4)) cmp[x] = 0; + } + } + } + //Step along both edges + _xa += _dxdya; + _xb += _dxdyb; + _ua += _dudya; + _va += _dvdya; + } + xa = _xa; + xb = _xb; + ua = _ua; + va = _va; +} +#else +{ + float _dudx = dudx, _dvdx = dvdx; + float _dxdya = dxdya, _dxdyb = dxdyb, _dudya = dudya, _dvdya = dvdya; + float _xa = xa, _xb = xb, _ua = ua, _va = va; + auto sbuf = image->buf32; + int32_t sw = static_cast(image->stride); + int32_t sh = 
image->h; + int32_t x1, x2, x, y, ar, ab, iru, irv, px, ay; + int32_t vv = 0, uu = 0; + int32_t minx = INT32_MAX, maxx = INT32_MIN; + float dx, u, v, iptr; + SwSpan* span = nullptr; //used only when rle based. + + if (!_arrange(image, region, yStart, yEnd)) return; + + //Loop through all lines in the segment + uint32_t spanIdx = 0; + + if (region) { + minx = region->min.x; + maxx = region->max.x; + } else { + span = image->rle->spans; + while (span->y < yStart) { + ++span; + ++spanIdx; + } + } + + y = yStart; + + while (y < yEnd) { + x1 = (int32_t)_xa; + x2 = (int32_t)_xb; + + if (!region) { + minx = INT32_MAX; + maxx = INT32_MIN; + //one single row, could be consisted of multiple spans. + while (span->y == y && spanIdx < image->rle->size) { + if (minx > span->x) minx = span->x; + if (maxx < span->x + span->len) maxx = span->x + span->len; + ++span; + ++spanIdx; + } + } + if (x1 < minx) x1 = minx; + if (x2 > maxx) x2 = maxx; + + //Anti-Aliasing frames + ay = y - aaSpans->yStart; + if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1; + if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2; + + //Range allowed + if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) { + + //Perform subtexel pre-stepping on UV + dx = 1 - (_xa - x1); + u = _ua + dx * _dudx; + v = _va + dx * _dvdx; + + x = x1; + + auto cmp = &surface->compositor->image.buf32[y * surface->compositor->image.stride + x1]; + + if (opacity == 255) { + //Draw horizontal line + while (x++ < x2) { + uu = (int) u; + vv = (int) v; + + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + if (vv >= sh) continue; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if 
(iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } +#ifdef TEXMAP_ADD_MASK + *cmp = px + ALPHA_BLEND(*cmp, IALPHA(px)); +#elif defined(TEXMAP_SUB_MASK) + *cmp = ALPHA_BLEND(*cmp, IALPHA(px)); +#elif defined(TEXMAP_DIF_MASK) + *cmp = ALPHA_BLEND(px, IALPHA(*cmp)) + ALPHA_BLEND(*cmp, IALPHA(px)); +#endif + ++cmp; + + //Step UV horizontally + u += _dudx; + v += _dvdx; + //range over? + if ((uint32_t)v >= image->h) break; + } + } else { + //Draw horizontal line + while (x++ < x2) { + uu = (int) u; + vv = (int) v; + + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + if (vv >= sh) continue; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } +#ifdef TEXMAP_ADD_MASK + *cmp = INTERPOLATE(px, *cmp, opacity); +#elif defined(TEXMAP_SUB_MASK) + *cmp = ALPHA_BLEND(*cmp, IALPHA(ALPHA_BLEND(px, opacity))); +#elif defined(TEXMAP_DIF_MASK) + auto src = ALPHA_BLEND(px, opacity); + *cmp = ALPHA_BLEND(src, IALPHA(*cmp)) + ALPHA_BLEND(*cmp, IALPHA(src)); +#endif + ++cmp; + + //Step UV horizontally + u += _dudx; + v += _dvdx; + //range over? 
+ if ((uint32_t)v >= image->h) break; + } + } + } + + //Step along both edges + _xa += _dxdya; + _xb += _dxdyb; + _ua += _dudya; + _va += _dvdya; + + if (!region && spanIdx >= image->rle->size) break; + + ++y; + } + xa = _xa; + xb = _xb; + ua = _ua; + va = _va; +} +#endif \ No newline at end of file diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterNeon.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterNeon.h index 5bed2f85a..0b581fbd6 100644 --- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterNeon.h +++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterNeon.h @@ -67,7 +67,7 @@ static bool neonRasterTranslucentRle(SwSurface* surface, const SwRleData* rle, u else src = color; auto dst = &surface->buf32[span->y * surface->stride + span->x]; - auto ialpha = 255 - _alpha(src); + auto ialpha = IALPHA(src); if ((((size_t) dst) & 0x7) != 0) { //fill not aligned byte @@ -105,7 +105,7 @@ static bool neonRasterTranslucentRect(SwSurface* surface, const SwBBox& region, auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; auto h = static_cast(region.max.y - region.min.y); auto w = static_cast(region.max.x - region.min.x); - auto ialpha = 255 - _alpha(color); + auto ialpha = IALPHA(color); auto vColor = vdup_n_u32(color); auto vIalpha = vdup_n_u8((uint8_t) ialpha); diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmap.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmap.h index 2a04ba742..7541b1d6f 100644 --- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmap.h +++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmap.h @@ -69,40 +69,46 @@ static bool _arrange(const SwImage* image, const SwBBox* region, int& yStart, in } -static void _rasterPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, uint32_t opacity, SwAlpha alpha, AASpans* aaSpans) +static void 
_rasterMaskedPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, uint32_t opacity, AASpans* aaSpans, uint8_t dirFlag = 0) { -#define TEXMAP_TRANSLUCENT -#define TEXMAP_MASKING - #include "tvgSwRasterTexmapInternal.h" -#undef TEXMAP_MASKING -#undef TEXMAP_TRANSLUCENT + auto method = surface->compositor->method; + + if (method == CompositeMethod::AddMask) { + #define TEXMAP_ADD_MASK + #include "tvgSwRasterMaskedTexmapInternal.h" + #undef TEXMAP_ADD_MASK + } else if (method == CompositeMethod::SubtractMask) { + #define TEXMAP_SUB_MASK + #include "tvgSwRasterMaskedTexmapInternal.h" + #undef TEXMAP_SUB_MASK + } else if (method == CompositeMethod::IntersectMask) { + #define TEXMAP_INT_MASK + #include "tvgSwRasterMaskedTexmapInternal.h" + #undef TEXMAP_INT_MASK + } else if (method == CompositeMethod::DifferenceMask) { + #define TEXMAP_DIF_MASK + #include "tvgSwRasterMaskedTexmapInternal.h" + #undef TEXMAP_DIF_MASK + } } -static void _rasterPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, SwAlpha alpha, AASpans* aaSpans) +static void _rasterMattedPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, uint32_t opacity, AASpans* aaSpans) { -#define TEXMAP_MASKING - #include "tvgSwRasterTexmapInternal.h" -#undef TEXMAP_MASKING +#define TEXMAP_MATTING + #include "tvgSwRasterTexmapInternal.h" +#undef TEXMAP_MATTING } static void _rasterPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, uint32_t opacity, AASpans* aaSpans) -{ -#define TEXMAP_TRANSLUCENT - #include "tvgSwRasterTexmapInternal.h" -#undef TEXMAP_TRANSLUCENT -} - - -static void _rasterPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, AASpans* aaSpans) { #include "tvgSwRasterTexmapInternal.h" } /* This mapping algorithm is based on Mikael Kalms's. 
*/ -static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const SwBBox* region, uint32_t opacity, Polygon& polygon, SwAlpha alpha, AASpans* aaSpans) +static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const SwBBox* region, uint32_t opacity, Polygon& polygon, AASpans* aaSpans) { float x[3] = {polygon.vertex[0].pt.x, polygon.vertex[1].pt.x, polygon.vertex[2].pt.x}; float y[3] = {polygon.vertex[0].pt.y, polygon.vertex[1].pt.y, polygon.vertex[2].pt.y}; @@ -165,6 +171,7 @@ static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const if (mathEqual(y[1], y[2])) side = x[2] > x[1]; auto regionTop = region ? region->min.y : image->rle->spans->y; //Normal Image or Rle Image? + auto compositing = _compositing(surface); //Composition required //Longer edge is on the left side if (!side) { @@ -190,13 +197,10 @@ static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const dxdyb = dxdy[0]; xb = x[0] + dy * dxdyb + (off_y * dxdyb); - if (alpha) { - if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], alpha, aaSpans); - else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, alpha, aaSpans); - } else { - if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans); - else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans); - } + if (compositing) { + if (_matting(surface)) _rasterMattedPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans); + else _rasterMaskedPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans, 1); + } else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans); upper = true; } @@ -211,13 +215,10 @@ static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const // Set right edge X-slope and perform subpixel pre-stepping dxdyb = dxdy[2]; xb = x[1] + (1 - (y[1] - yi[1])) * dxdyb + 
(off_y * dxdyb); - if (alpha) { - if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], alpha, aaSpans); - else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, alpha, aaSpans); - } else { - if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans); - else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans); - } + if (compositing) { + if (_matting(surface)) _rasterMattedPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans); + else _rasterMaskedPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans, 2); + } else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans); } //Longer edge is on the right side } else { @@ -240,13 +241,10 @@ static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const ua = u[0] + dy * dudya + (off_y * dudya); va = v[0] + dy * dvdya + (off_y * dvdya); - if (alpha) { - if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], alpha, aaSpans); - else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, alpha, aaSpans); - } else { - if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans); - else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans); - } + if (compositing) { + if (_matting(surface)) _rasterMattedPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans); + else _rasterMaskedPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans, 3); + } else _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], opacity, aaSpans); upper = true; } @@ -264,13 +262,10 @@ static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const ua = u[1] + dy * dudya + (off_y * dudya); va = v[1] + dy * dvdya + (off_y * dvdya); - if (alpha) { - if (opacity == 255) 
_rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], alpha, aaSpans); - else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, alpha, aaSpans); - } else { - if (opacity == 255) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans); - else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans); - } + if (compositing) { + if (_matting(surface)) _rasterMattedPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans); + else _rasterMaskedPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans, 4); + } else _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], opacity, aaSpans); } } } @@ -508,7 +503,7 @@ static bool _apply(SwSurface* surface, AASpans* aaSpans) pos = 1; while (pos <= line->length[0]) { - *dst = INTERPOLATE((line->coverage[0] * pos), *dst, pixel); + *dst = INTERPOLATE(*dst, pixel, line->coverage[0] * pos); ++dst; ++pos; } @@ -520,7 +515,7 @@ static bool _apply(SwSurface* surface, AASpans* aaSpans) pos = width; while ((int32_t)(width - line->length[1]) < pos) { - *dst = INTERPOLATE(255 - (line->coverage[1] * (line->length[1] - (width - pos))), *dst, pixel); + *dst = INTERPOLATE(*dst, pixel, 255 - (line->coverage[1] * (line->length[1] - (width - pos)))); --dst; --pos; } @@ -545,7 +540,7 @@ static bool _apply(SwSurface* surface, AASpans* aaSpans) | / | 3 -- 2 */ -static bool _rasterTexmapPolygon(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox* region, uint32_t opacity, SwAlpha alpha) +static bool _rasterTexmapPolygon(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox* region, uint32_t opacity) { //Exceptions: No dedicated drawing area? 
if ((!image->rle && !region) || (image->rle && image->rle->size == 0)) return false; @@ -576,14 +571,14 @@ static bool _rasterTexmapPolygon(SwSurface* surface, const SwImage* image, const polygon.vertex[1] = vertices[1]; polygon.vertex[2] = vertices[3]; - _rasterPolygonImage(surface, image, region, opacity, polygon, alpha, aaSpans); + _rasterPolygonImage(surface, image, region, opacity, polygon, aaSpans); //Draw the second polygon polygon.vertex[0] = vertices[1]; polygon.vertex[1] = vertices[2]; polygon.vertex[2] = vertices[3]; - _rasterPolygonImage(surface, image, region, opacity, polygon, alpha, aaSpans); + _rasterPolygonImage(surface, image, region, opacity, polygon, aaSpans); return _apply(surface, aaSpans); } @@ -602,7 +597,7 @@ static bool _rasterTexmapPolygon(SwSurface* surface, const SwImage* image, const Should provide two Polygons, one for each triangle. // TODO: region? */ -static bool _rasterTexmapPolygonMesh(SwSurface* surface, const SwImage* image, const RenderMesh* mesh, const Matrix* transform, const SwBBox* region, uint32_t opacity, SwAlpha alpha) +static bool _rasterTexmapPolygonMesh(SwSurface* surface, const SwImage* image, const RenderMesh* mesh, const Matrix* transform, const SwBBox* region, uint32_t opacity) { //Exceptions: No dedicated drawing area? 
if ((!image->rle && !region) || (image->rle && image->rle->size == 0)) return false; @@ -636,7 +631,7 @@ static bool _rasterTexmapPolygonMesh(SwSurface* surface, const SwImage* image, c auto aaSpans = _AASpans(ys, ye, image, region); if (aaSpans) { for (uint32_t i = 0; i < mesh->triangleCnt; i++) { - _rasterPolygonImage(surface, image, region, opacity, transformedTris[i], alpha, aaSpans); + _rasterPolygonImage(surface, image, region, opacity, transformedTris[i], aaSpans); } // Apply to surface (note: frees the AA spans) _apply(surface, aaSpans); diff --git a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h index 92e8c1b35..bfa7db218 100644 --- a/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h +++ b/libfenrir/src/main/jni/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h @@ -36,9 +36,9 @@ uint32_t* buf; SwSpan* span = nullptr; //used only when rle based. 
-#ifdef TEXMAP_MASKING - uint8_t* cmp; +#ifdef TEXMAP_MATTING auto csize = surface->compositor->image.channelSize; + auto alpha = surface->blender.alpha(surface->compositor->method); #endif if (!_arrange(image, region, yStart, yEnd)) return; @@ -82,75 +82,121 @@ if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1; if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2; - //Range exception - if ((x2 - x1) < 1 || (x1 >= maxx) || (x2 <= minx)) goto next; + //Range allowed + if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) { - //Perform subtexel pre-stepping on UV - dx = 1 - (_xa - x1); - u = _ua + dx * _dudx; - v = _va + dx * _dvdx; + //Perform subtexel pre-stepping on UV + dx = 1 - (_xa - x1); + u = _ua + dx * _dudx; + v = _va + dx * _dvdx; - buf = dbuf + ((y * dw) + x1); + buf = dbuf + ((y * dw) + x1); - x = x1; + x = x1; -#ifdef TEXMAP_MASKING - cmp = &surface->compositor->image.buf8[(y * surface->compositor->image.stride + x1) * csize]; +#ifdef TEXMAP_MATTING + auto cmp = &surface->compositor->image.buf8[(y * surface->compositor->image.stride + x1) * csize]; #endif - //Draw horizontal line - while (x++ < x2) { - uu = (int) u; - vv = (int) v; - - ar = (int)(255 * (1 - modff(u, &iptr))); - ab = (int)(255 * (1 - modff(v, &iptr))); - iru = uu + 1; - irv = vv + 1; - - if (vv >= sh) continue; - - px = *(sbuf + (vv * sw) + uu); - - /* horizontal interpolate */ - if (iru < sw) { - /* right pixel */ - int px2 = *(sbuf + (vv * sw) + iru); - px = INTERPOLATE(ar, px, px2); - } - /* vertical interpolate */ - if (irv < sh) { - /* bottom pixel */ - int px2 = *(sbuf + (irv * sw) + uu); - - /* horizontal interpolate */ - if (iru < sw) { - /* bottom right pixel */ - int px3 = *(sbuf + (irv * sw) + iru); - px2 = INTERPOLATE(ar, px2, px3); - } - px = INTERPOLATE(ab, px, px2); - } -#if defined(TEXMAP_MASKING) && defined(TEXMAP_TRANSLUCENT) - auto src = ALPHA_BLEND(px, _multiply(opacity, alpha(cmp))); -#elif defined(TEXMAP_MASKING) - auto src = ALPHA_BLEND(px, 
alpha(cmp)); -#elif defined(TEXMAP_TRANSLUCENT) - auto src = ALPHA_BLEND(px, opacity); + if (opacity == 255) { + //Draw horizontal line + while (x++ < x2) { + uu = (int) u; + vv = (int) v; + + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + if (vv >= sh) continue; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } +#ifdef TEXMAP_MATTING + auto src = ALPHA_BLEND(px, alpha(cmp)); + cmp += csize; #else - auto src = px; + auto src = px; #endif - *buf = src + ALPHA_BLEND(*buf, _ialpha(src)); - ++buf; -#ifdef TEXMAP_MASKING - cmp += csize; + *buf = src + ALPHA_BLEND(*buf, IALPHA(src)); + ++buf; + + //Step UV horizontally + u += _dudx; + v += _dvdx; + //range over? 
+ if ((uint32_t)v >= image->h) break; + } + } else { + //Draw horizontal line + while (x++ < x2) { + uu = (int) u; + vv = (int) v; + + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + if (vv >= sh) continue; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } +#ifdef TEXMAP_MATTING + auto src = ALPHA_BLEND(px, MULTIPLY(opacity, alpha(cmp))); + cmp += csize; +#else + auto src = ALPHA_BLEND(px, opacity); #endif - //Step UV horizontally - u += _dudx; - v += _dvdx; - //range over? - if ((uint32_t)v >= image->h) break; + *buf = src + ALPHA_BLEND(*buf, IALPHA(src)); + ++buf; + + //Step UV horizontally + u += _dudx; + v += _dvdx; + //range over? + if ((uint32_t)v >= image->h) break; + } + } } -next: //Step along both edges _xa += _dxdya; _xb += _dxdyb; diff --git a/libfenrir/src/main/jni/thorvg/src/lib/tvgPaint.cpp b/libfenrir/src/main/jni/thorvg/src/lib/tvgPaint.cpp index 5746fe49a..450a221d6 100644 --- a/libfenrir/src/main/jni/thorvg/src/lib/tvgPaint.cpp +++ b/libfenrir/src/main/jni/thorvg/src/lib/tvgPaint.cpp @@ -166,6 +166,7 @@ bool Paint::Impl::render(RenderMethod& renderer) Create a composition image. 
*/ if (compData && compData->method != CompositeMethod::ClipPath && !(compData->target->pImpl->ctxFlag & ContextFlag::FastTrack)) { auto region = smethod->bounds(renderer); + if (MASK_OPERATION(compData->method)) region.add(compData->target->pImpl->smethod->bounds(renderer)); if (region.w == 0 || region.h == 0) return true; cmp = renderer.target(region, COMPOSITE_TO_COLORSPACE(renderer, compData->method)); if (renderer.beginComposite(cmp, CompositeMethod::None, 255)) { @@ -209,11 +210,18 @@ RenderData Paint::Impl::update(RenderMethod& renderer, const RenderTransform* pT auto tryFastTrack = false; if (target->identifier() == TVG_CLASS_ID_SHAPE) { if (method == CompositeMethod::ClipPath) tryFastTrack = true; + //OPTIMIZE HERE: Actually, this condition AlphaMask is useless. We can skip it? else if (method == CompositeMethod::AlphaMask) { auto shape = static_cast(target); uint8_t a; shape->fillColor(nullptr, nullptr, nullptr, &a); if (a == 255 && shape->opacity() == 255 && !shape->fill()) tryFastTrack = true; + //OPTIMIZE HERE: Actually, this condition InvAlphaMask is useless. We can skip it? 
+ } else if (method == CompositeMethod::InvAlphaMask) { + auto shape = static_cast(target); + uint8_t a; + shape->fillColor(nullptr, nullptr, nullptr, &a); + if ((a == 0 || shape->opacity() == 0) && !shape->fill()) tryFastTrack = true; } if (tryFastTrack) { RenderRegion viewport2; diff --git a/libfenrir/src/main/jni/thorvg/src/lib/tvgRender.h b/libfenrir/src/main/jni/thorvg/src/lib/tvgRender.h index e4445c579..62be27df9 100644 --- a/libfenrir/src/main/jni/thorvg/src/lib/tvgRender.h +++ b/libfenrir/src/main/jni/thorvg/src/lib/tvgRender.h @@ -98,6 +98,20 @@ struct RenderRegion if (w < 0) w = 0; if (h < 0) h = 0; } + + void add(const RenderRegion& rhs) + { + if (rhs.x < x) { + w += (x - rhs.x); + x = rhs.x; + } + if (rhs.y < y) { + h += (y - rhs.y); + y = rhs.y; + } + if (rhs.x + rhs.w > x + w) w = (rhs.x + rhs.w) - x; + if (rhs.y + rhs.h > y + h) h = (rhs.y + rhs.h) - y; + } }; struct RenderTransform @@ -238,6 +252,25 @@ class RenderMethod virtual bool endComposite(Compositor* cmp) = 0; }; +static inline bool MASK_OPERATION(CompositeMethod method) +{ + switch(method) { + case CompositeMethod::AlphaMask: + case CompositeMethod::InvAlphaMask: + case CompositeMethod::LumaMask: + case CompositeMethod::InvLumaMask: + return false; + case CompositeMethod::AddMask: + case CompositeMethod::SubtractMask: + case CompositeMethod::IntersectMask: + case CompositeMethod::DifferenceMask: + return true; + default: + TVGERR("COMMON", "Unsupported Composite Size! 
= %d", (int)method); + return false; + } +} + static inline uint8_t CHANNEL_SIZE(ColorSpace cs) { switch(cs) { @@ -263,6 +296,10 @@ static inline ColorSpace COMPOSITE_TO_COLORSPACE(RenderMethod& renderer, Composi return ColorSpace::Grayscale8; case CompositeMethod::LumaMask: case CompositeMethod::InvLumaMask: + case CompositeMethod::AddMask: + case CompositeMethod::SubtractMask: + case CompositeMethod::IntersectMask: + case CompositeMethod::DifferenceMask: return renderer.colorSpace(); default: TVGERR("COMMON", "Unsupported Composite Size! = %d", (int)method); diff --git a/other_tool/getter-setter-fix-plugin-gradle/build.gradle b/other_tool/getter-setter-fix-plugin-gradle/build.gradle index 14a7553ed..4d31a3174 100644 --- a/other_tool/getter-setter-fix-plugin-gradle/build.gradle +++ b/other_tool/getter-setter-fix-plugin-gradle/build.gradle @@ -1,6 +1,6 @@ plugins { id("java") - id("kotlin") + id("org.jetbrains.kotlin.jvm") version "1.8.21" id("org.jetbrains.intellij") version "1.9.0" } @@ -14,7 +14,7 @@ repositories { } intellij { - version = "2022.2.1" + version = "2022.3.1" type = "IC" plugins = ['android'] } diff --git a/other_tool/getter-setter-fix-plugin-gradle/getter-setter-fix-plugin-gradle-1.0.zip b/other_tool/getter-setter-fix-plugin-gradle/getter-setter-fix-plugin-gradle-1.0.zip index 610667b53..9e2d0961d 100644 Binary files a/other_tool/getter-setter-fix-plugin-gradle/getter-setter-fix-plugin-gradle-1.0.zip and b/other_tool/getter-setter-fix-plugin-gradle/getter-setter-fix-plugin-gradle-1.0.zip differ diff --git a/other_tool/getter-setter-fix-plugin-gradle/gradle.properties b/other_tool/getter-setter-fix-plugin-gradle/gradle.properties new file mode 100644 index 000000000..21554ff12 --- /dev/null +++ b/other_tool/getter-setter-fix-plugin-gradle/gradle.properties @@ -0,0 +1,14 @@ +## For more details on how to configure your build environment visit +# http://www.gradle.org/docs/current/userguide/build_environment.html +# +# Specifies the JVM arguments 
used for the daemon process. +# The setting is particularly useful for tweaking memory settings. +# Default value: -Xmx1024m -XX:MaxPermSize=256m +# org.gradle.jvmargs=-Xmx2048m -XX:MaxPermSize=512m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8 +# +# When configured, Gradle will run in incubating parallel mode. +# This option should only be used with decoupled projects. More details, visit +# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects +# org.gradle.parallel=true +#Tue May 30 17:05:45 MSK 2023 +org.gradle.jvmargs=-Xmx2048M -Dkotlin.daemon.jvm.options\="-Xmx2048M" diff --git a/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.jar b/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 000000000..249e5832f Binary files /dev/null and b/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.jar differ diff --git a/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.properties b/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 000000000..aba097b06 --- /dev/null +++ b/other_tool/getter-setter-fix-plugin-gradle/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,5 @@ +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-7.6.1-bin.zip +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists \ No newline at end of file diff --git a/other_tool/getter-setter-fix-plugin-gradle/gradlew b/other_tool/getter-setter-fix-plugin-gradle/gradlew new file mode 100644 index 000000000..a69d9cb6c --- /dev/null +++ b/other_tool/getter-setter-fix-plugin-gradle/gradlew @@ -0,0 +1,240 @@ +#!/bin/sh + +# +# Copyright © 2015-2021 the original authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +# +# Gradle start up script for POSIX generated by Gradle. +# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. 
+# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. +# +############################################################################## + +# Attempt to set APP_HOME + +# Resolve links: $0 may be a link +app_path=$0 + +# Need this for daisy-chained symlinks. +while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac +done + +APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit + +APP_NAME="Gradle" +APP_BASE_NAME=${0##*/} + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD=maximum + +warn () { + echo "$*" +} >&2 + +die () { + echo + echo "$*" + echo + exit 1 +} >&2 + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD=$JAVA_HOME/jre/sh/java + else + JAVACMD=$JAVA_HOME/bin/java + fi + if [ ! 
-x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD=java + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac +fi + +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. + +# For Cygwin or MSYS, switch paths to Windows format before running java +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) + + # Now convert the arguments - kludge to limit ourselves to /bin/sh + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) + fi + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. 
+ # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. + shift # remove old arg + set -- "$@" "$arg" # push replacement arg + done +fi + +# Collect all arguments for the java command; +# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of +# shell script including quotes and variable substitutions, so put them in +# double quotes to make sure that they get re-expanded; and +# * put everything else in single quotes, so that it's not re-expanded. + +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Stop when "xargs" is not available. +if ! command -v xargs >/dev/null 2>&1 +then + die "xargs is not available" +fi + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. +# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. 
+# + +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' + +exec "$JAVACMD" "$@" diff --git a/other_tool/getter-setter-fix-plugin-gradle/gradlew.bat b/other_tool/getter-setter-fix-plugin-gradle/gradlew.bat new file mode 100644 index 000000000..f127cfd49 --- /dev/null +++ b/other_tool/getter-setter-fix-plugin-gradle/gradlew.bat @@ -0,0 +1,91 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%"=="" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%"=="" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if %ERRORLEVEL% equ 0 goto execute + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if %ERRORLEVEL% equ 0 goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! 
+set EXIT_CODE=%ERRORLEVEL% +if %EXIT_CODE% equ 0 set EXIT_CODE=1 +if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% +exit /b %EXIT_CODE% + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/other_tool/kspcomplier/build.gradle.kts b/other_tool/kspcomplier/build.gradle.kts index 781aecc72..4caca8de8 100644 --- a/other_tool/kspcomplier/build.gradle.kts +++ b/other_tool/kspcomplier/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "dev.umerov.ksp" -version = "1.0-SNAPSHOT" +version = "1.0" repositories { mavenCentral() @@ -13,10 +13,10 @@ repositories { dependencies { implementation(kotlin("stdlib")) - implementation("androidx.annotation:annotation:1.4.0") - implementation("org.jetbrains.kotlinx:kotlinx-serialization-core:1.4.0-RC") + implementation("androidx.annotation:annotation:1.6.0") + implementation("org.jetbrains.kotlinx:kotlinx-serialization-core:1.5.1") - implementation("com.google.devtools.ksp:symbol-processing-api:1.7.0-1.0.6") + implementation("com.google.devtools.ksp:symbol-processing-api:1.9.0-Beta-1.0.11") } sourceSets.main { diff --git a/picasso3/src/main/kotlin/com/squareup/picasso3/PicassoDrawable.kt b/picasso3/src/main/kotlin/com/squareup/picasso3/PicassoDrawable.kt index baf43f3b5..1115f1036 100644 --- a/picasso3/src/main/kotlin/com/squareup/picasso3/PicassoDrawable.kt +++ b/picasso3/src/main/kotlin/com/squareup/picasso3/PicassoDrawable.kt @@ -16,7 +16,13 @@ package com.squareup.picasso3 import android.content.Context -import android.graphics.* +import android.graphics.Bitmap +import android.graphics.Canvas +import android.graphics.Color +import android.graphics.ColorFilter +import android.graphics.Paint +import android.graphics.Path +import android.graphics.Rect import android.graphics.drawable.Animatable import android.graphics.drawable.BitmapDrawable import android.graphics.drawable.Drawable diff --git a/viewpager2/build.gradle b/viewpager2/build.gradle index 68b9b1e7b..24906499a 100644 --- a/viewpager2/build.gradle +++ 
b/viewpager2/build.gradle @@ -27,8 +27,10 @@ android { } gradle.projectsEvaluated { - tasks.withType(JavaCompile) { - options.compilerArgs << "-Xlint:deprecation" + tasks.withType(JavaCompile).tap { + configureEach { + options.compilerArgs << "-Xlint:deprecation" + } } } } diff --git a/viewpager2/src/main/java/androidx/recyclerview/widget/ChildHelper.java b/viewpager2/src/main/java/androidx/recyclerview/widget/ChildHelper.java index 198b70f95..5bfee529a 100644 --- a/viewpager2/src/main/java/androidx/recyclerview/widget/ChildHelper.java +++ b/viewpager2/src/main/java/androidx/recyclerview/widget/ChildHelper.java @@ -20,6 +20,8 @@ import android.view.View; import android.view.ViewGroup; +import androidx.annotation.NonNull; + import java.util.ArrayList; import java.util.List; @@ -358,9 +360,10 @@ void unhide(View view) { unhideViewInternal(view); } + @NonNull @Override public String toString() { - return mBucket.toString() + ", hidden list:" + mHiddenViews.size(); + return mBucket + ", hidden list:" + mHiddenViews.size(); } /** @@ -503,10 +506,11 @@ int countOnesBefore(int index) { } } + @NonNull @Override public String toString() { return mNext == null ? Long.toBinaryString(mData) - : mNext.toString() + "xx" + Long.toBinaryString(mData); + : mNext + "xx" + Long.toBinaryString(mData); } }