, V : IMvpVie
AbsMvpBottomSheetDialogFragment(), IMvpView, IErrorView, IToastView, IToolbarView {
override fun showError(errorText: String?) {
- customToast?.showToastError(errorText)
+ if (isAdded) {
+ customToast?.showToastError(errorText)
+ }
}
override val customToast: AbsCustomToast?
diff --git a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/settings/backup/SettingsBackup.kt b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/settings/backup/SettingsBackup.kt
index 5ed7562b1..7f677c0ae 100644
--- a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/settings/backup/SettingsBackup.kt
+++ b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/settings/backup/SettingsBackup.kt
@@ -32,6 +32,7 @@ class SettingsBackup {
var download_photo_tap: Boolean? = null
var show_photos_line: Boolean? = null
var instant_photo_display: Boolean? = null
+ var validate_tls: Boolean? = null
var picasso_dispatcher: String? = null
var audio_round_icon: Boolean? = null
var use_long_click_download: Boolean? = null
diff --git a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/serializeble/json/internal/JsonExceptions.kt b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/serializeble/json/internal/JsonExceptions.kt
index ff14dbd75..979624cc7 100644
--- a/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/serializeble/json/internal/JsonExceptions.kt
+++ b/app_filegallery/src/main/kotlin/dev/ragnarok/filegallery/util/serializeble/json/internal/JsonExceptions.kt
@@ -56,7 +56,7 @@ internal fun AbstractJsonLexer.invalidTrailingComma(entity: String = "object"):
fail(
"Trailing comma before the end of JSON $entity",
position = currentPosition - 1,
- hint = "Trailing commas are non-complaint JSON and not allowed by default. Use 'allowTrailingCommas = true' in 'Json {}' builder to support them."
+ hint = "Trailing commas are non-complaint JSON and not allowed by default. Use 'allowTrailingComma = true' in 'Json {}' builder to support them."
)
}
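For context on the corrected hint above: the flag it names is the singular `allowTrailingComma`, set on the `Json {}` builder. A minimal usage sketch, assuming the vendored builder mirrors kotlinx.serialization's API; the `lenientJson` value is purely illustrative:

```kotlin
import kotlinx.serialization.ExperimentalSerializationApi
import kotlinx.serialization.json.Json

// Accepts trailing commas such as "[1, 2, 3,]" instead of failing with the hint above.
@OptIn(ExperimentalSerializationApi::class)
val lenientJson = Json {
    allowTrailingComma = true
}
```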
diff --git a/build.gradle b/build.gradle
index 9a6261394..ce57bb856 100644
--- a/build.gradle
+++ b/build.gradle
@@ -34,7 +34,7 @@ buildscript {
ext.graphicsShapesVersion = "1.0.1"
ext.lifecycleVersion = "2.8.7"
ext.mediaVersion = "1.7.0"
- ext.media3Version = "1.5.0-rc02"
+ ext.media3Version = "1.5.0"
ext.resourceInspectionAnnotation = "1.0.1"
ext.savedStateVersion = "1.3.0-alpha05"
ext.swiperefreshlayoutVersion = "1.2.0-alpha01"
@@ -58,7 +58,7 @@ buildscript {
ext.autoValueVersion = "1.11.0"
//common libraries
- ext.kotlin_version = "2.1.0-RC2"
+ ext.kotlin_version = "2.1.0"
ext.kotlin_coroutines = "1.9.0"
ext.kotlin_serializer = "1.7.3"
ext.okhttpLibraryVersion = "5.0.0-SNAPSHOT"
@@ -92,7 +92,7 @@ buildscript {
//maven { url 'https://s01.oss.sonatype.org/content/repositories/snapshots/' }
}
dependencies {
- classpath "com.android.tools.build:gradle:8.7.2"
+ classpath "com.android.tools.build:gradle:8.8.0-rc01"
classpath "com.google.gms:google-services:4.4.2"
classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version"
classpath "org.jetbrains.kotlin:kotlin-serialization:$kotlin_version"
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index e52cec12e..0108df974 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
#Wed Mar 20 16:00:00 MSK 2024
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.11-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.11.1-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
diff --git a/libfenrir/build.gradle b/libfenrir/build.gradle
index c05556242..af4f0df7b 100644
--- a/libfenrir/build.gradle
+++ b/libfenrir/build.gradle
@@ -49,7 +49,7 @@ android {
externalNativeBuild {
cmake {
- version = "3.31.0"
+ version = "3.31.1"
path = file("src/main/jni/CMakeLists.txt")
}
}
diff --git a/libfenrir/src/main/java/com/github/luben/zstd/Zstd.java b/libfenrir/src/main/java/com/github/luben/zstd/Zstd.java
index a40b8ac71..56d538f3c 100644
--- a/libfenrir/src/main/java/com/github/luben/zstd/Zstd.java
+++ b/libfenrir/src/main/java/com/github/luben/zstd/Zstd.java
@@ -531,7 +531,7 @@ public static long decompressDirectByteBufferFastDict(ByteBuffer dst, int dstOff
public static native int loadDictCompress(long stream, byte[] dict, int dict_size);
public static native int loadFastDictCompress(long stream, ZstdDictCompress dict);
// TODO: Fix native compilation
- //public static native void registerSequenceProducer(long stream, long seqProdState, long seqProdFunction);
+ public static native void registerSequenceProducer(long stream, long seqProdState, long seqProdFunction);
// static native long getBuiltinSequenceProducer(); // Used in tests
static native long getStubSequenceProducer(); // Used in tests
public static native int setCompressionChecksums(long stream, boolean useChecksums);
diff --git a/libfenrir/src/main/java/com/github/luben/zstd/ZstdCompressCtx.java b/libfenrir/src/main/java/com/github/luben/zstd/ZstdCompressCtx.java
index 06539e097..6a3f9c1f3 100644
--- a/libfenrir/src/main/java/com/github/luben/zstd/ZstdCompressCtx.java
+++ b/libfenrir/src/main/java/com/github/luben/zstd/ZstdCompressCtx.java
@@ -274,7 +274,6 @@ public ZstdCompressCtx setLong(int windowLog) {
* Register an external sequence producer
* @param producer the user-defined {@link SequenceProducer} to register.
*/
- /* TODO: fix compilation
public ZstdCompressCtx registerSequenceProducer(SequenceProducer producer) {
ensureOpen();
acquireSharedLock();
@@ -300,7 +299,6 @@ public ZstdCompressCtx registerSequenceProducer(SequenceProducer producer) {
}
return this;
}
- */
/**
* Enable or disable sequence producer fallback
@@ -361,7 +359,7 @@ public ZstdCompressCtx setValidateSequences(Zstd.ParamSwitch validateSequences)
/**
* Enable or disable long-distance matching.
- * @param enableLDM whether to enable long-distance matching.
+ * @param ldm whether to enable long-distance matching.
*/
public ZstdCompressCtx setEnableLongDistanceMatching(Zstd.ParamSwitch enableLDM) {
ensureOpen();
@@ -433,7 +431,12 @@ public ZstdCompressCtx loadDict(byte[] dict) {
*/
public ZstdFrameProgression getFrameProgression() {
ensureOpen();
- return getFrameProgression0(nativePtr);
+ acquireSharedLock();
+ try {
+ return getFrameProgression0(nativePtr);
+ } finally {
+ releaseSharedLock();
+ }
}
private static native ZstdFrameProgression getFrameProgression0(long ptr);
@@ -443,10 +446,16 @@ public ZstdFrameProgression getFrameProgression() {
*/
public void reset() {
ensureOpen();
- long result = reset0(nativePtr);
- if (Zstd.isError(result)) {
- throw new ZstdException(result);
+ acquireSharedLock();
+ try {
+ long result = reset0(nativePtr);
+ if (Zstd.isError(result)) {
+ throw new ZstdException(result);
+ }
+ } finally {
+ releaseSharedLock();
}
+
}
private static native long reset0(long ptr);
@@ -460,9 +469,14 @@ public void reset() {
*/
public void setPledgedSrcSize(long srcSize) {
ensureOpen();
- long result = setPledgedSrcSize0(nativePtr, srcSize);
- if (Zstd.isError(result)) {
- throw new ZstdException(result);
+ acquireSharedLock();
+ try {
+ long result = setPledgedSrcSize0(nativePtr, srcSize);
+ if (Zstd.isError(result)) {
+ throw new ZstdException(result);
+ }
+ } finally {
+ releaseSharedLock();
}
}
private static native long setPledgedSrcSize0(long ptr, long srcSize);
@@ -478,14 +492,19 @@ public void setPledgedSrcSize(long srcSize) {
*/
public boolean compressDirectByteBufferStream(ByteBuffer dst, ByteBuffer src, EndDirective endOp) {
ensureOpen();
- long result = compressDirectByteBufferStream0(nativePtr, dst, dst.position(), dst.limit(), src, src.position(), src.limit(), endOp.value());
- if ((result & 0x80000000L) != 0) {
- long code = result & 0xFF;
- throw new ZstdException(code, Zstd.getErrorName(code));
+ acquireSharedLock();
+ try {
+ long result = compressDirectByteBufferStream0(nativePtr, dst, dst.position(), dst.limit(), src, src.position(), src.limit(), endOp.value());
+ if ((result & 0x80000000L) != 0) {
+ long code = result & 0xFF;
+ throw new ZstdException(code, Zstd.getErrorName(code));
+ }
+ src.position((int)(result & 0x7FFFFFFF));
+ dst.position((int)(result >>> 32) & 0x7FFFFFFF);
+ return (result >>> 63) == 1;
+ } finally {
+ releaseSharedLock();
}
- src.position((int)(result & 0x7FFFFFFF));
- dst.position((int)(result >>> 32) & 0x7FFFFFFF);
- return (result >>> 63) == 1;
}
/**
@@ -600,7 +619,6 @@ public int compressByteArray(byte[] dstBuff, int dstOffset, int dstSize, byte[]
* @return the size of the compressed data
*/
public int compress(ByteBuffer dstBuf, ByteBuffer srcBuf) {
-
int size = compressDirectByteBuffer(dstBuf, // compress into dstBuf
dstBuf.position(), // write compressed data starting at offset position()
dstBuf.limit() - dstBuf.position(), // write no more than limit() - position() bytes
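The streaming hunks above (and the matching ones in `ZstdDecompressCtx.java` below) decode a single packed 64-bit status word returned by the native call: bit 31 flags an error whose code sits in the low byte, bits 0–30 carry the new source position, bits 32–62 the new destination position, and bit 63 whether the operation finished. A small sketch of that decoding, with the bit layout taken from the code shown; the `decodeStreamResult` helper is illustrative and not part of zstd-jni:

```kotlin
// Packed result layout used by compressDirectByteBufferStream0 /
// decompressDirectByteBufferStream0, as decoded in the hunks above.
data class StreamResult(
    val error: Boolean,     // bit 31: native call failed
    val errorCode: Long,    // low byte: zstd error code, valid only when error == true
    val srcPosition: Int,   // bits 0..30: updated source buffer position
    val dstPosition: Int,   // bits 32..62: updated destination buffer position
    val finished: Boolean   // bit 63: frame (or flush) completed
)

fun decodeStreamResult(result: Long) = StreamResult(
    error = (result and 0x80000000L) != 0L,
    errorCode = result and 0xFFL,
    srcPosition = (result and 0x7FFFFFFFL).toInt(),
    dstPosition = ((result ushr 32) and 0x7FFFFFFFL).toInt(),
    finished = (result ushr 63) == 1L
)
```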
diff --git a/libfenrir/src/main/java/com/github/luben/zstd/ZstdDecompressCtx.java b/libfenrir/src/main/java/com/github/luben/zstd/ZstdDecompressCtx.java
index 7c68540ce..d61d9a1df 100644
--- a/libfenrir/src/main/java/com/github/luben/zstd/ZstdDecompressCtx.java
+++ b/libfenrir/src/main/java/com/github/luben/zstd/ZstdDecompressCtx.java
@@ -95,9 +95,18 @@ public ZstdDecompressCtx loadDict(byte[] dict) {
*/
public void reset() {
ensureOpen();
- reset0(nativePtr);
+ acquireSharedLock();
+ try {
+ long result = reset0(nativePtr);
+ if (Zstd.isError(result)) {
+ throw new ZstdException(result);
+ }
+ } finally {
+ releaseSharedLock();
+ }
+
}
- private static native void reset0(long nativePtr);
+ private static native long reset0(long nativePtr);
private void ensureOpen() {
if (nativePtr == 0) {
@@ -115,14 +124,19 @@ private void ensureOpen() {
*/
public boolean decompressDirectByteBufferStream(ByteBuffer dst, ByteBuffer src) {
ensureOpen();
- long result = decompressDirectByteBufferStream0(nativePtr, dst, dst.position(), dst.limit(), src, src.position(), src.limit());
- if ((result & 0x80000000L) != 0) {
- long code = result & 0xFF;
- throw new ZstdException(code, Zstd.getErrorName(code));
+ acquireSharedLock();
+ try {
+ long result = decompressDirectByteBufferStream0(nativePtr, dst, dst.position(), dst.limit(), src, src.position(), src.limit());
+ if ((result & 0x80000000L) != 0) {
+ long code = result & 0xFF;
+ throw new ZstdException(code, Zstd.getErrorName(code));
+ }
+ src.position((int)(result & 0x7FFFFFFF));
+ dst.position((int)(result >>> 32) & 0x7FFFFFFF);
+ return (result >>> 63) == 1;
+ } finally {
+ releaseSharedLock();
}
- src.position((int)(result & 0x7FFFFFFF));
- dst.position((int)(result >>> 32) & 0x7FFFFFFF);
- return (result >>> 63) == 1;
}
/**
@@ -236,7 +250,6 @@ public int decompressByteArray(byte[] dstBuff, int dstOffset, int dstSize, byte[
* @return the size of the decompressed data.
*/
public int decompress(ByteBuffer dstBuf, ByteBuffer srcBuf) throws ZstdException {
-
int size = decompressDirectByteBuffer(dstBuf, // decompress into dstBuf
dstBuf.position(), // write decompressed data at offset position()
dstBuf.limit() - dstBuf.position(), // write no more than limit() - position()
diff --git a/libfenrir/src/main/jni/CMakeLists.txt b/libfenrir/src/main/jni/CMakeLists.txt
index d83d5f6a3..fb8281ff5 100644
--- a/libfenrir/src/main/jni/CMakeLists.txt
+++ b/libfenrir/src/main/jni/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.31.0 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.31.1 FATAL_ERROR)
project(fenrir_jni C CXX ASM)
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
diff --git a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h
index 70f89134c..02fa935ca 100644
--- a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h
+++ b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/row.h
@@ -553,8 +553,15 @@ extern "C" {
#define HAS_BGRATOUVROW_SVE2
#define HAS_DIVIDEROW_16_SVE2
#define HAS_HALFFLOATROW_SVE2
+#define HAS_I210ALPHATOARGBROW_SVE2
+#define HAS_I210TOAR30ROW_SVE2
#define HAS_I210TOARGBROW_SVE2
+#define HAS_I212TOAR30ROW_SVE2
+#define HAS_I212TOARGBROW_SVE2
#define HAS_I400TOARGBROW_SVE2
+#define HAS_I410ALPHATOARGBROW_SVE2
+#define HAS_I410TOAR30ROW_SVE2
+#define HAS_I410TOARGBROW_SVE2
#define HAS_I422ALPHATOARGBROW_SVE2
#define HAS_I422TOARGB1555ROW_SVE2
#define HAS_I422TOARGB4444ROW_SVE2
@@ -565,7 +572,9 @@ extern "C" {
#define HAS_I444ALPHATOARGBROW_SVE2
#define HAS_I444TOARGBROW_SVE2
#define HAS_NV12TOARGBROW_SVE2
+#define HAS_NV12TORGB24ROW_SVE2
#define HAS_NV21TOARGBROW_SVE2
+#define HAS_NV21TORGB24ROW_SVE2
#define HAS_P210TOAR30ROW_SVE2
#define HAS_P210TOARGBROW_SVE2
#define HAS_P410TOAR30ROW_SVE2
@@ -582,8 +591,10 @@ extern "C" {
// The following are available on AArch64 SME platforms:
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
defined(__aarch64__)
+#define HAS_ARGBMULTIPLYROW_SME
#define HAS_I422TOARGBROW_SME
#define HAS_I444TOARGBROW_SME
+#define HAS_MULTIPLYROW_16_SME
#endif
// The following are available on AArch64 platforms:
@@ -1064,6 +1075,13 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I210AlphaToARGBRow_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@@ -1071,6 +1089,13 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I410AlphaToARGBRow_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I444ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -1113,30 +1138,60 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I410ToARGBRow_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I210ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I210ToAR30Row_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I410ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I410ToAR30Row_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I212ToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToARGBRow_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I212ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
+void I212ToAR30Row_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
void I422ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -1279,11 +1334,21 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
+void NV12ToRGB24Row_SVE2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
void NV21ToRGB24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
+void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
@@ -3321,6 +3386,10 @@ void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int scale,
int width);
+void MultiplyRow_16_SME(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
void DivideRow_16_C(const uint16_t* src_y,
uint16_t* dst_y,
@@ -4988,6 +5057,10 @@ void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
+void ARGBMultiplyRow_SME(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
@@ -6670,14 +6743,6 @@ void HalfFloatRow_SVE2(const uint16_t* src,
uint16_t* dst,
float scale,
int width);
-void HalfFloat1Row_NEON(const uint16_t* src,
- uint16_t* dst,
- float scale,
- int width);
-void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr,
- uint16_t* dst_ptr,
- float param,
- int width);
void HalfFloat1Row_SVE2(const uint16_t* src,
uint16_t* dst,
float scale,
diff --git a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/scale_row.h b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/scale_row.h
index 101ccbf84..3747c318e 100644
--- a/libfenrir/src/main/jni/animation/libyuv/include/libyuv/scale_row.h
+++ b/libfenrir/src/main/jni/animation/libyuv/include/libyuv/scale_row.h
@@ -116,6 +116,11 @@ extern "C" {
#define HAS_SCALEUVROWUP2_BILINEAR_16_NEON
#endif
+// The following are available on AArch64 Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_SCALEROWDOWN2_16_NEON
+#endif
+
// The following are available on AArch64 SME platforms:
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
defined(__aarch64__)
@@ -1423,6 +1428,10 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
+void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
void ScaleRowDown2_SME(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -1431,6 +1440,10 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
+void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
void ScaleRowDown2Linear_SME(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc b/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc
index 7a2f7813f..1405d2bea 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/convert_argb.cc
@@ -977,6 +977,11 @@ int I010ToAR30Matrix(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I210TOAR30ROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I210ToAR30Row = I210ToAR30Row_SVE2;
+ }
+#endif
#if defined(HAS_I210TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
@@ -1160,6 +1165,11 @@ int I012ToAR30Matrix(const uint16_t* src_y,
I212ToAR30Row = I212ToAR30Row_NEON;
}
}
+#endif
+#if defined(HAS_I212TOAR30ROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I212ToAR30Row = I212ToAR30Row_SVE2;
+ }
#endif
for (y = 0; y < height; ++y) {
I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
@@ -1211,6 +1221,11 @@ int I210ToAR30Matrix(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I210TOAR30ROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I210ToAR30Row = I210ToAR30Row_SVE2;
+ }
+#endif
#if defined(HAS_I210TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
@@ -1374,6 +1389,11 @@ int I410ToAR30Matrix(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I410TOAR30ROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I410ToAR30Row = I410ToAR30Row_SVE2;
+ }
+#endif
#if defined(HAS_I410TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
@@ -1628,6 +1648,11 @@ int I012ToARGBMatrix(const uint16_t* src_y,
I212ToARGBRow = I212ToARGBRow_NEON;
}
}
+#endif
+#if defined(HAS_I212TOARGBROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I212ToARGBRow = I212ToARGBRow_SVE2;
+ }
#endif
for (y = 0; y < height; ++y) {
I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -1859,6 +1884,11 @@ int I410ToARGBMatrix(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I410TOARGBROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I410ToARGBRow = I410ToARGBRow_SVE2;
+ }
+#endif
#if defined(HAS_I410TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I410ToARGBRow = I410ToARGBRow_Any_AVX2;
@@ -2259,6 +2289,22 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y,
ARGBAttenuateRow = ARGBAttenuateRow_RVV;
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -2407,6 +2453,22 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y,
ARGBAttenuateRow = ARGBAttenuateRow_RVV;
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -2537,6 +2599,22 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y,
ARGBAttenuateRow = ARGBAttenuateRow_RVV;
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -2725,6 +2803,11 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I210ALPHATOARGBROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_SVE2;
+ }
+#endif
#if defined(HAS_I210ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
@@ -2778,6 +2861,22 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y,
ARGBAttenuateRow = ARGBAttenuateRow_RVV;
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -2839,6 +2938,11 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I210ALPHATOARGBROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_SVE2;
+ }
+#endif
#if defined(HAS_I210ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
@@ -2892,6 +2996,22 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y,
ARGBAttenuateRow = ARGBAttenuateRow_RVV;
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -2951,6 +3071,11 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I410ALPHATOARGBROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_SVE2;
+ }
+#endif
#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
@@ -3004,6 +3129,22 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y,
ARGBAttenuateRow = ARGBAttenuateRow_RVV;
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -4469,6 +4610,11 @@ int NV12ToRGB24Matrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV12TORGB24ROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_SVE2;
+ }
+#endif
#if defined(HAS_NV12TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
@@ -4535,6 +4681,11 @@ int NV21ToRGB24Matrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV21TORGB24ROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_SVE2;
+ }
+#endif
#if defined(HAS_NV21TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
@@ -6937,6 +7088,11 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I410TOAR30ROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I410ToAR30Row = I410ToAR30Row_SVE2;
+ }
+#endif
#if defined(HAS_I410TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
@@ -7051,6 +7207,11 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I410TOAR30ROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I410ToAR30Row = I410ToAR30Row_SVE2;
+ }
+#endif
#if defined(HAS_I410TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
@@ -7152,6 +7313,11 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I410TOARGBROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I410ToARGBRow = I410ToARGBRow_SVE2;
+ }
+#endif
#if defined(HAS_I410TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I410ToARGBRow = I410ToARGBRow_Any_AVX2;
@@ -7265,6 +7431,11 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I410TOARGBROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I410ToARGBRow = I410ToARGBRow_SVE2;
+ }
+#endif
#if defined(HAS_I410TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I410ToARGBRow = I410ToARGBRow_Any_AVX2;
@@ -7438,6 +7609,22 @@ static int I420AlphaToARGBMatrixBilinear(
ARGBAttenuateRow = ARGBAttenuateRow_RVV;
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -7653,6 +7840,22 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y,
ARGBAttenuateRow = ARGBAttenuateRow_RVV;
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
#if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -7754,6 +7957,11 @@ static int I010AlphaToARGBMatrixBilinear(
}
}
#endif
+#if defined(HAS_I410ALPHATOARGBROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_SVE2;
+ }
+#endif
#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
@@ -7807,6 +8015,22 @@ static int I010AlphaToARGBMatrixBilinear(
ARGBAttenuateRow = ARGBAttenuateRow_RVV;
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -7930,6 +8154,11 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_I410ALPHATOARGBROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_SVE2;
+ }
+#endif
#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
@@ -7983,6 +8212,22 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y,
ARGBAttenuateRow = ARGBAttenuateRow_RVV;
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+ }
+ }
+#endif
#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc b/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc
index be67a1ded..cc909eb73 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/planar_functions.cc
@@ -829,6 +829,11 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_MULTIPLYROW_16_SME)
+ if (TestCpuFlag(kCpuHasSME)) {
+ MultiplyRow_16 = MultiplyRow_16_SME;
+ }
+#endif
for (y = 0; y < height; ++y) {
MultiplyRow_16(src_y, dst_y, scale, width);
@@ -3134,6 +3139,11 @@ int ARGBMultiply(const uint8_t* src_argb0,
}
}
#endif
+#if defined(HAS_ARGBMULTIPLYROW_SME)
+ if (TestCpuFlag(kCpuHasSME)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_SME;
+ }
+#endif
#if defined(HAS_ARGBMULTIPLYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
@@ -5208,11 +5218,18 @@ int HalfFloatPlane(const uint16_t* src_y,
}
#endif
#if defined(HAS_HALFFLOATROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- HalfFloatRow =
- scale == 1.0f ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
+ if (TestCpuFlag(kCpuHasNEON)
+#if defined(__arm__)
+ // When scale is 1/65535 the scale * 2^-112 used to convert is a denormal.
+ // But when Neon vmul is asked to multiply a normal float by that
+ // denormal scale, even though the result would have been normal, it
+ // flushes to zero. The scalar version of vmul supports denormals.
+ && scale >= 1.0f / 4096.0f
+#endif
+ ) {
+ HalfFloatRow = HalfFloatRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
- HalfFloatRow = scale == 1.0f ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
+ HalfFloatRow = HalfFloatRow_NEON;
}
}
#endif
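The `scale >= 1.0f / 4096.0f` guard added to the NEON path above follows directly from the comment in the hunk: on 32-bit ARM the row kernel multiplies by `scale * 1.9259299444e-34f` (the literal visible in the `row_neon.cc` hunk further down), i.e. scale · 2⁻¹¹², and for very small scales that product is a subnormal binary32 value that NEON `vmul` flushes to zero. Working the numbers, with standard binary32 limits assumed:

```latex
\tfrac{1}{65535}\cdot 2^{-112} \approx 2^{-16}\cdot 2^{-112} = 2^{-128} < 2^{-126}\;(\text{smallest normal binary32}),
\qquad
\tfrac{1}{4096}\cdot 2^{-112} = 2^{-12}\cdot 2^{-112} = 2^{-124} \ge 2^{-126}.
```

So any scale at or above 1/4096 keeps the combined multiplier in the normal range, while smaller scales stay on the scalar row, which (per the comment) handles denormals correctly.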
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_any.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_any.cc
index a61ab817c..70ab046ec 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_any.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_any.cc
@@ -1813,25 +1813,7 @@ ANY11P16(HalfFloat1Row_Any_F16C,
15)
#endif
#ifdef HAS_HALFFLOATROW_NEON
-#ifdef __aarch64__
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 15)
-ANY11P16(HalfFloat1Row_Any_NEON,
- HalfFloat1Row_NEON,
- uint16_t,
- uint16_t,
- 2,
- 2,
- 15)
-#else
-ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
-ANY11P16(HalfFloat1Row_Any_NEON,
- HalfFloat1Row_NEON,
- uint16_t,
- uint16_t,
- 2,
- 2,
- 7)
-#endif
#endif
#ifdef HAS_HALFFLOATROW_MSA
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc
index 6d49aa5e8..734d7ee29 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_lasx.cc
@@ -1148,24 +1148,26 @@ void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
__m256i b, g, r, a, dst0, dst1;
__m256i control = {0x0005000100040000, 0x0007000300060002, 0x0005000100040000,
0x0007000300060002};
+ __m256i zero = __lasx_xvldi(0);
+ __m256i const_add = __lasx_xvldi(0x8ff);
for (x = 0; x < len; x++) {
DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
tmp0 = __lasx_xvpickev_b(src1, src0);
tmp1 = __lasx_xvpickod_b(src1, src0);
- b = __lasx_xvpackev_b(tmp0, tmp0);
- r = __lasx_xvpackod_b(tmp0, tmp0);
- g = __lasx_xvpackev_b(tmp1, tmp1);
- a = __lasx_xvpackod_b(tmp1, tmp1);
- reg0 = __lasx_xvmulwev_w_hu(b, a);
- reg1 = __lasx_xvmulwod_w_hu(b, a);
- reg2 = __lasx_xvmulwev_w_hu(r, a);
- reg3 = __lasx_xvmulwod_w_hu(r, a);
- reg4 = __lasx_xvmulwev_w_hu(g, a);
- reg5 = __lasx_xvmulwod_w_hu(g, a);
- reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24);
- reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24);
- reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24);
+ b = __lasx_xvpackev_b(zero, tmp0);
+ r = __lasx_xvpackod_b(zero, tmp0);
+ g = __lasx_xvpackev_b(zero, tmp1);
+ a = __lasx_xvpackod_b(zero, tmp1);
+ reg0 = __lasx_xvmaddwev_w_hu(const_add, b, a);
+ reg1 = __lasx_xvmaddwod_w_hu(const_add, b, a);
+ reg2 = __lasx_xvmaddwev_w_hu(const_add, r, a);
+ reg3 = __lasx_xvmaddwod_w_hu(const_add, r, a);
+ reg4 = __lasx_xvmaddwev_w_hu(const_add, g, a);
+ reg5 = __lasx_xvmaddwod_w_hu(const_add, g, a);
+ reg0 = __lasx_xvssrani_h_w(reg1, reg0, 8);
+ reg2 = __lasx_xvssrani_h_w(reg3, reg2, 8);
+ reg4 = __lasx_xvssrani_h_w(reg5, reg4, 8);
reg0 = __lasx_xvshuf_h(control, reg0, reg0);
reg2 = __lasx_xvshuf_h(control, reg2, reg2);
reg4 = __lasx_xvshuf_h(control, reg4, reg4);
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc
index ee74cad9f..50d5ba6a0 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_lsx.cc
@@ -1102,24 +1102,26 @@ void ARGBAttenuateRow_LSX(const uint8_t* src_argb,
__m128i reg0, reg1, reg2, reg3, reg4, reg5;
__m128i b, g, r, a, dst0, dst1;
__m128i control = {0x0005000100040000, 0x0007000300060002};
+ __m128i zero = __lsx_vldi(0);
+ __m128i const_add = __lsx_vldi(0x8ff);
for (x = 0; x < len; x++) {
DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
tmp0 = __lsx_vpickev_b(src1, src0);
tmp1 = __lsx_vpickod_b(src1, src0);
- b = __lsx_vpackev_b(tmp0, tmp0);
- r = __lsx_vpackod_b(tmp0, tmp0);
- g = __lsx_vpackev_b(tmp1, tmp1);
- a = __lsx_vpackod_b(tmp1, tmp1);
- reg0 = __lsx_vmulwev_w_hu(b, a);
- reg1 = __lsx_vmulwod_w_hu(b, a);
- reg2 = __lsx_vmulwev_w_hu(r, a);
- reg3 = __lsx_vmulwod_w_hu(r, a);
- reg4 = __lsx_vmulwev_w_hu(g, a);
- reg5 = __lsx_vmulwod_w_hu(g, a);
- reg0 = __lsx_vssrani_h_w(reg1, reg0, 24);
- reg2 = __lsx_vssrani_h_w(reg3, reg2, 24);
- reg4 = __lsx_vssrani_h_w(reg5, reg4, 24);
+ b = __lsx_vpackev_b(zero, tmp0);
+ r = __lsx_vpackod_b(zero, tmp0);
+ g = __lsx_vpackev_b(zero, tmp1);
+ a = __lsx_vpackod_b(zero, tmp1);
+ reg0 = __lsx_vmaddwev_w_hu(const_add, b, a);
+ reg1 = __lsx_vmaddwod_w_hu(const_add, b, a);
+ reg2 = __lsx_vmaddwev_w_hu(const_add, r, a);
+ reg3 = __lsx_vmaddwod_w_hu(const_add, r, a);
+ reg4 = __lsx_vmaddwev_w_hu(const_add, g, a);
+ reg5 = __lsx_vmaddwod_w_hu(const_add, g, a);
+ reg0 = __lsx_vssrani_h_w(reg1, reg0, 8);
+ reg2 = __lsx_vssrani_h_w(reg3, reg2, 8);
+ reg4 = __lsx_vssrani_h_w(reg5, reg4, 8);
reg0 = __lsx_vshuf_h(control, reg0, reg0);
reg2 = __lsx_vshuf_h(control, reg2, reg2);
reg4 = __lsx_vshuf_h(control, reg4, reg4);
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc
index 1211a3727..cfbb364d1 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_neon.cc
@@ -3536,59 +3536,41 @@ void SobelYRow_NEON(const uint8_t* src_y0,
}
// %y passes a float as a scalar vector for vector * scalar multiply.
-// the regoster must be d0 to d15 and indexed with [0] or [1] to access
+// the register must be d0 to d15 and indexed with [0] or [1] to access
// the float in the first or second float of the d-reg
-void HalfFloat1Row_NEON(const uint16_t* src,
- uint16_t* dst,
- float /*unused*/,
- int width) {
- asm volatile (
-
- "1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(1.9259299444e-34f) // %3
- : "cc", "memory", "q1", "q2", "q3");
-}
-
void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
int width) {
- asm volatile (
+ asm volatile (
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
+ "vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts
+ "subs %2, %2, #16 \n" // 16 pixels per loop
+ "vmovl.u16 q8, d0 \n"
+ "vmovl.u16 q9, d1 \n"
+ "vmovl.u16 q10, d2 \n"
+ "vmovl.u16 q11, d3 \n"
+ "vcvt.f32.u32 q8, q8 \n"
+ "vcvt.f32.u32 q9, q9 \n"
+ "vcvt.f32.u32 q10, q10 \n"
+ "vcvt.f32.u32 q11, q11 \n"
+ "vmul.f32 q8, q8, %y3 \n" // adjust exponent
+ "vmul.f32 q9, q9, %y3 \n"
+ "vmul.f32 q10, q10, %y3 \n"
+ "vmul.f32 q11, q11, %y3 \n"
+ "vqshrn.u32 d0, q8, #13 \n" // isolate halffloat
+ "vqshrn.u32 d1, q9, #13 \n"
+ "vqshrn.u32 d2, q10, #13 \n"
+ "vqshrn.u32 d3, q11, #13 \n"
+ "vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16
"bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3
- : "cc", "memory", "q1", "q2", "q3");
+ : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
}
void ByteToFloatRow_NEON(const uint8_t* src,
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc
index 4b1ed2c0c..55f686766 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_neon64.cc
@@ -4664,37 +4664,6 @@ void SobelYRow_NEON(const uint8_t* src_y0,
);
}
-// Caveat - rounds float to half float whereas scaling version truncates.
-void HalfFloat1Row_NEON(const uint16_t* src,
- uint16_t* dst,
- float /*unused*/,
- int width) {
- asm volatile(
- "1: \n"
- "ldp q0, q1, [%0], #32 \n" // load 16 shorts
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "uxtl v2.4s, v0.4h \n"
- "uxtl v4.4s, v1.4h \n"
- "uxtl2 v3.4s, v0.8h \n"
- "uxtl2 v5.4s, v1.8h \n"
- "prfm pldl1keep, [%0, 448] \n"
- "scvtf v2.4s, v2.4s \n"
- "scvtf v4.4s, v4.4s \n"
- "scvtf v3.4s, v3.4s \n"
- "scvtf v5.4s, v5.4s \n"
- "fcvtn v0.4h, v2.4s \n"
- "fcvtn v1.4h, v4.4s \n"
- "fcvtn2 v0.8h, v3.4s \n"
- "fcvtn2 v1.8h, v5.4s \n"
- "stp q0, q1, [%1], #32 \n" // store 16 shorts
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
-}
-
void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
@@ -4717,10 +4686,10 @@ void HalfFloatRow_NEON(const uint16_t* src,
"fmul v3.4s, v3.4s, %3.s[0] \n"
"fmul v5.4s, v5.4s, %3.s[0] \n"
"uqshrn v0.4h, v2.4s, #13 \n" // isolate halffloat
- "uqshrn v1.4h, v4.4s, #13 \n" // isolate halffloat
+ "uqshrn v1.4h, v4.4s, #13 \n"
"uqshrn2 v0.8h, v3.4s, #13 \n"
"uqshrn2 v1.8h, v5.4s, #13 \n"
- "stp q0, q1, [%1], #32 \n" // store 16 shorts
+ "stp q0, q1, [%1], #32 \n" // store 16 fp16
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_sme.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_sme.cc
index 7676d9e64..da94cd7be 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_sme.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_sme.cc
@@ -216,6 +216,102 @@ __arm_locally_streaming void I422ToARGBRow_SME(
: "cc", "memory", YUVTORGB_SVE_REGS);
}
+__arm_locally_streaming void MultiplyRow_16_SME(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // Streaming-SVE only, no use of ZA tile.
+ int vl;
+ asm volatile(
+ "cnth %x[vl] \n"
+ "mov z0.h, %w[scale] \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p0.h \n"
+ "1: \n"
+ "ld1h {z1.h}, p0/z, [%[src_y]] \n"
+ "incb %[src_y] \n"
+ "mul z1.h, z0.h, z1.h \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "st1h {z1.h}, p0, [%[dst_y]] \n"
+ "incb %[dst_y] \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p0.h, wzr, %w[width] \n"
+ "ld1h {z1.h}, p0/z, [%[src_y]] \n"
+ "mul z1.h, z0.h, z1.h \n"
+ "st1h {z1.h}, p0, [%[dst_y]] \n"
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [dst_y] "+r"(dst_y), // %[dst_y]
+ [width] "+r"(width), // %[width]
+ [vl] "=&r"(vl) // %[vl]
+ : [scale] "r"(scale) // %[scale]
+ : "memory", "cc", "z0", "z1", "p0");
+}
+
+__arm_locally_streaming void ARGBMultiplyRow_SME(const uint8_t* src_argb,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ // Streaming-SVE only, no use of ZA tile.
+ width *= 4;
+ int vl;
+ asm volatile(
+ "cntb %x[vl] \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p0.b \n"
+ "1: \n"
+ "ld1b {z0.b}, p0/z, [%[src_argb]] \n"
+ "ld1b {z1.b}, p0/z, [%[src_argb1]] \n"
+ "incb %[src_argb] \n"
+ "incb %[src_argb1] \n"
+ "umullb z2.h, z0.b, z1.b \n"
+ "umullt z1.h, z0.b, z1.b \n"
+ "rshrnb z0.b, z2.h, #8 \n"
+ "rshrnt z0.b, z1.h, #8 \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "st1b {z0.b}, p0, [%[dst_argb]] \n"
+ "incb %[dst_argb] \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p0.b, wzr, %w[width] \n"
+ "ld1b {z0.b}, p0/z, [%[src_argb]] \n"
+ "ld1b {z1.b}, p0/z, [%[src_argb1]] \n"
+ "umullb z2.h, z0.b, z1.b \n"
+ "umullt z1.h, z0.b, z1.b \n"
+ "rshrnb z0.b, z2.h, #8 \n"
+ "rshrnt z0.b, z1.h, #8 \n"
+ "st1b {z0.b}, p0, [%[dst_argb]] \n"
+
+ "99: \n"
+ : [src_argb] "+r"(src_argb), // %[src_argb]
+ [src_argb1] "+r"(src_argb1), // %[src_argb1]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width), // %[width]
+ [vl] "=&r"(vl) // %[vl]
+ :
+ : "memory", "cc", "z0", "z1", "z2", "p0", "p1");
+}
+
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/row_sve.cc b/libfenrir/src/main/jni/animation/libyuv/source/row_sve.cc
index bfa49d9c2..51b22ddd7 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/row_sve.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/row_sve.cc
@@ -68,33 +68,50 @@ extern "C" {
// We need a different predicate for the UV component to handle the tail.
// If there is a single element remaining then we want to load one Y element
// but two UV elements.
-#define READNV_SVE \
- "ld1b {z0.h}, p1/z, [%[src_y]] \n" /* Y0Y0 */ \
- "ld1b {z1.h}, p2/z, [%[src_uv]] \n" /* U0V0 or V0U0 */ \
- "inch %[src_y] \n" \
- "inch %[src_uv] \n" \
+#define READNV_SVE_2X \
+ "ld1b {z0.b}, p1/z, [%[src_y]] \n" /* Y0Y0 */ \
+ "ld1b {z2.b}, p2/z, [%[src_uv]] \n" /* U0V0 or V0U0 */ \
+ "incb %[src_y] \n" \
+ "incb %[src_uv] \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"prfm pldl1keep, [%[src_uv], 256] \n" \
- "trn1 z0.b, z0.b, z0.b \n" /* YYYY */ \
- "tbl z1.b, {z1.b}, z22.b \n" /* UVUV */
+ "trn2 z1.b, z0.b, z0.b \n" /* YYYY */ \
+ "trn1 z0.b, z0.b, z0.b \n" /* YYYY */
#define READI210_SVE \
"ld1h {z3.h}, p1/z, [%[src_y]] \n" \
- "lsl z0.h, z3.h, #6 \n" \
- "usra z0.h, z3.h, #4 \n" \
"ld1h {z1.s}, p1/z, [%[src_u]] \n" \
"ld1h {z2.s}, p1/z, [%[src_v]] \n" \
"incb %[src_y] \n" \
"inch %[src_u] \n" \
"inch %[src_v] \n" \
+ "lsl z0.h, z3.h, #6 \n" \
+ "trn1 z1.h, z1.h, z1.h \n" \
+ "trn1 z2.h, z2.h, z2.h \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"prfm pldl1keep, [%[src_u], 128] \n" \
"prfm pldl1keep, [%[src_v], 128] \n" \
- "trn1 z1.h, z1.h, z1.h \n" \
- "trn1 z2.h, z2.h, z2.h \n" \
+ "usra z0.h, z3.h, #4 \n" \
"uqshrnb z1.b, z1.h, #2 \n" \
"uqshrnb z2.b, z2.h, #2 \n"
+#define READI212_SVE \
+ "ld1h {z3.h}, p1/z, [%[src_y]] \n" \
+ "ld1h {z1.s}, p1/z, [%[src_u]] \n" \
+ "ld1h {z2.s}, p1/z, [%[src_v]] \n" \
+ "incb %[src_y] \n" \
+ "inch %[src_u] \n" \
+ "inch %[src_v] \n" \
+ "lsl z0.h, z3.h, #4 \n" \
+ "trn1 z1.h, z1.h, z1.h \n" \
+ "trn1 z2.h, z2.h, z2.h \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "prfm pldl1keep, [%[src_u], 128] \n" \
+ "prfm pldl1keep, [%[src_v], 128] \n" \
+ "usra z0.h, z3.h, #8 \n" \
+ "uqshrnb z1.b, z1.h, #4 \n" \
+ "uqshrnb z2.b, z2.h, #4 \n"
+
#define READP210_SVE \
"ld1h {z0.h}, p1/z, [%[src_y]] \n" \
"ld1h {z1.h}, p2/z, [%[src_uv]] \n" \
@@ -104,6 +121,21 @@ extern "C" {
"prfm pldl1keep, [%[src_uv], 256] \n" \
"tbl z1.b, {z1.b}, z22.b \n"
+#define READI410_SVE \
+ "ld1h {z3.h}, p1/z, [%[src_y]] \n" \
+ "lsl z0.h, z3.h, #6 \n" \
+ "usra z0.h, z3.h, #4 \n" \
+ "ld1h {z1.h}, p1/z, [%[src_u]] \n" \
+ "ld1h {z2.h}, p1/z, [%[src_v]] \n" \
+ "incb %[src_y] \n" \
+ "incb %[src_u] \n" \
+ "incb %[src_v] \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "prfm pldl1keep, [%[src_u], 128] \n" \
+ "prfm pldl1keep, [%[src_v], 128] \n" \
+ "uqshrnb z1.b, z1.h, #2 \n" \
+ "uqshrnb z2.b, z2.h, #2 \n"
+
// We need different predicates for the UV components since we are reading
// 32-bit (pairs of UV) elements rather than 16-bit Y elements.
#define READP410_SVE \
@@ -193,6 +225,27 @@ extern "C" {
"uqsub z18.h, z18.h, z27.h \n" /* R0 */ \
"uqsub z22.h, z22.h, z27.h \n" /* R1 */
+#define NVTORGB_SVE_2X(bt_u, bt_v) \
+ "umulh z0.h, z24.h, z0.h \n" /* Y0 */ \
+ "umulh z1.h, z24.h, z1.h \n" /* Y1 */ \
+ "umull" #bt_u " z6.h, z30.b, z2.b \n" \
+ "umull" #bt_u " z4.h, z28.b, z2.b \n" /* DB */ \
+ "umull" #bt_v " z5.h, z29.b, z2.b \n" /* DR */ \
+ "umlal" #bt_v " z6.h, z31.b, z2.b \n" /* DG */ \
+ \
+ "add z17.h, z0.h, z26.h \n" /* G0 */ \
+ "add z21.h, z1.h, z26.h \n" /* G1 */ \
+ "add z16.h, z0.h, z4.h \n" /* B0 */ \
+ "add z20.h, z1.h, z4.h \n" /* B1 */ \
+ "add z18.h, z0.h, z5.h \n" /* R0 */ \
+ "add z22.h, z1.h, z5.h \n" /* R1 */ \
+ "uqsub z17.h, z17.h, z6.h \n" /* G0 */ \
+ "uqsub z21.h, z21.h, z6.h \n" /* G1 */ \
+ "uqsub z16.h, z16.h, z25.h \n" /* B0 */ \
+ "uqsub z20.h, z20.h, z25.h \n" /* B1 */ \
+ "uqsub z18.h, z18.h, z27.h \n" /* R0 */ \
+ "uqsub z22.h, z22.h, z27.h \n" /* R1 */
+
#define I400TORGB_SVE \
"umulh z18.h, z24.h, z0.h \n" /* Y */ \
"movprfx z16, z18 \n" \
@@ -210,6 +263,13 @@ extern "C" {
"uqshrnt z16.b, z17.h, #6 \n" /* BG */ \
"trn1 z17.b, z18.b, z19.b \n" /* RA */
+#define RGBATOARGB8_SVE \
+ /* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.h */ \
+ "uqshrnb z16.b, z16.h, #6 \n" /* B0 */ \
+ "uqshrnt z16.b, z17.h, #6 \n" /* BG */ \
+ "uqshrnb z17.b, z18.h, #6 \n" /* R0 */ \
+ "uqshrnt z17.b, z19.h, #2 \n" /* RA */
+
#define RGBTOARGB8_SVE_2X \
/* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \
"uqshrnb z16.b, z16.h, #6 \n" /* B0 */ \
@@ -745,33 +805,36 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_SVE_REGS);
}
-static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- uint32_t nv_uv_start,
- uint32_t nv_uv_step) {
+void NV12ToARGBRow_SVE2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint32_t nv_u_start = 0xff00U;
+ uint32_t nv_u_step = 0x0002U;
+ uint32_t nv_v_start = 0xff01U;
+ uint32_t nv_v_step = 0x0002U;
uint64_t vl;
- asm("cnth %0" : "=r"(vl));
+ asm("cntb %0" : "=r"(vl));
int width_last_y = width & (vl - 1);
int width_last_uv = width_last_y + (width_last_y & 1);
asm volatile(
"ptrue p0.b \n" //
YUVTORGB_SVE_SETUP
- "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n"
"dup z19.b, #255 \n" // A
+ "index z7.h, %w[nv_u_start], %w[nv_u_step] \n"
+ "index z23.h, %w[nv_v_start], %w[nv_v_step] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
- "ptrue p1.h \n"
- "ptrue p2.h \n"
+ "ptrue p1.b \n"
+ "ptrue p2.b \n"
"1: \n" //
- READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE
+ READNV_SVE_2X NVTORGB_SVE_2X(b, t) RGBTOARGB8_SVE_2X
"subs %w[width], %w[width], %w[vl] \n"
- "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+ "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
"b.ge 1b \n"
@@ -780,11 +843,10 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
- "3: \n"
- "whilelt p1.h, wzr, %w[width_last_y] \n"
- "whilelt p2.h, wzr, %w[width_last_uv] \n" //
- READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE
- "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+ "whilelt p1.b, wzr, %w[width_last_y] \n"
+ "whilelt p2.b, wzr, %w[width_last_uv] \n" //
+ READNV_SVE_2X NVTORGB_SVE_2X(b, t) RGBTOARGB8_SVE_2X
+ "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
@@ -794,33 +856,193 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
: [vl] "r"(vl), // %[vl]
[kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
- [nv_uv_start] "r"(nv_uv_start), // %[nv_uv_start]
- [nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step]
+ [nv_u_start] "r"(nv_u_start), // %[nv_u_start]
+ [nv_u_step] "r"(nv_u_step), // %[nv_u_step]
+ [nv_v_start] "r"(nv_v_start), // %[nv_v_start]
+ [nv_v_step] "r"(nv_v_step), // %[nv_v_step]
[width_last_y] "r"(width_last_y), // %[width_last_y]
[width_last_uv] "r"(width_last_uv) // %[width_last_uv]
: "cc", "memory", YUVTORGB_SVE_REGS, "p2");
}
-void NV12ToARGBRow_SVE2(const uint8_t* src_y,
- const uint8_t* src_uv,
- uint8_t* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- uint32_t nv_uv_start = 0x02000200U;
- uint32_t nv_uv_step = 0x04040404U;
- NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_uv_start,
- nv_uv_step);
-}
-
void NV21ToARGBRow_SVE2(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- uint32_t nv_uv_start = 0x00020002U;
- uint32_t nv_uv_step = 0x04040404U;
- NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_uv_start,
- nv_uv_step);
+ uint32_t nv_u_start = 0xff01U;
+ uint32_t nv_u_step = 0x0002U;
+ uint32_t nv_v_start = 0xff00U;
+ uint32_t nv_v_step = 0x0002U;
+ uint64_t vl;
+ asm("cntb %0" : "=r"(vl));
+ int width_last_y = width & (vl - 1);
+ int width_last_uv = width_last_y + (width_last_y & 1);
+ asm volatile(
+ "ptrue p0.b \n" //
+ YUVTORGB_SVE_SETUP
+ "dup z19.b, #255 \n" // A
+ "index z7.h, %w[nv_u_start], %w[nv_u_step] \n"
+ "index z23.h, %w[nv_v_start], %w[nv_v_step] \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p1.b \n"
+ "ptrue p2.b \n"
+ "1: \n" //
+ READNV_SVE_2X NVTORGB_SVE_2X(t, b) RGBTOARGB8_SVE_2X
+ "subs %w[width], %w[width], %w[vl] \n"
+ "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
+ "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p1.b, wzr, %w[width_last_y] \n"
+ "whilelt p2.b, wzr, %w[width_last_uv] \n" //
+ READNV_SVE_2X NVTORGB_SVE_2X(t, b) RGBTOARGB8_SVE_2X
+ "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_vu), // %[src_vu]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [vl] "r"(vl), // %[vl]
+ [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [nv_u_start] "r"(nv_u_start), // %[nv_u_start]
+ [nv_u_step] "r"(nv_u_step), // %[nv_u_step]
+ [nv_v_start] "r"(nv_v_start), // %[nv_v_start]
+ [nv_v_step] "r"(nv_v_step), // %[nv_v_step]
+ [width_last_y] "r"(width_last_y), // %[width_last_y]
+ [width_last_uv] "r"(width_last_uv) // %[width_last_uv]
+ : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
+}
+
+void NV12ToRGB24Row_SVE2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint32_t nv_u_start = 0xff00U;
+ uint32_t nv_u_step = 0x0002U;
+ uint32_t nv_v_start = 0xff01U;
+ uint32_t nv_v_step = 0x0002U;
+ uint64_t vl;
+ asm("cntb %0" : "=r"(vl));
+ int width_last_y = width & (vl - 1);
+ int width_last_uv = width_last_y + (width_last_y & 1);
+ asm volatile(
+ "ptrue p0.b \n" //
+ YUVTORGB_SVE_SETUP
+ "dup z19.b, #255 \n" // A
+ "index z7.h, %w[nv_u_start], %w[nv_u_step] \n"
+ "index z23.h, %w[nv_v_start], %w[nv_v_step] \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p1.b \n"
+ "ptrue p2.b \n"
+ "1: \n" //
+ READNV_SVE_2X NVTORGB_SVE_2X(b, t) RGBTOARGB8_SVE_2X
+ "subs %w[width], %w[width], %w[vl] \n"
+ "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"
+ "incb %[dst_rgb24], all, mul #3 \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p1.b, wzr, %w[width_last_y] \n"
+ "whilelt p2.b, wzr, %w[width_last_uv] \n" //
+ READNV_SVE_2X NVTORGB_SVE_2X(b, t) RGBTOARGB8_SVE_2X
+ "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [vl] "r"(vl), // %[vl]
+ [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [nv_u_start] "r"(nv_u_start), // %[nv_u_start]
+ [nv_u_step] "r"(nv_u_step), // %[nv_u_step]
+ [nv_v_start] "r"(nv_v_start), // %[nv_v_start]
+ [nv_v_step] "r"(nv_v_step), // %[nv_v_step]
+ [width_last_y] "r"(width_last_y), // %[width_last_y]
+ [width_last_uv] "r"(width_last_uv) // %[width_last_uv]
+ : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
+}
+
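The NV12 and NV21 variants are otherwise identical; the index seeds differ only in their low byte (0xff00/0xff01 versus 0xff01/0xff00), i.e. in which byte of each interleaved chroma pair is read as U and which as V, and the chroma tail count is rounded up to an even number so the final pair is always fetched whole. A minimal scalar sketch of that addressing, assuming the usual NV12 (UV) and NV21 (VU) byte orders; the helper names are illustrative only:

#include <stdint.h>

// Sketch only: per-pixel chroma fetch for the two semi-planar byte orders.
// Each U/V pair covers two horizontal pixels, so pixel x maps to the byte
// pair at offsets (x & ~1) and (x & ~1) + 1 in the interleaved plane.
static inline uint8_t nv_u(const uint8_t* src_uv, int x, int is_nv21) {
  return src_uv[(x & ~1) + (is_nv21 ? 1 : 0)];
}
static inline uint8_t nv_v(const uint8_t* src_uv, int x, int is_nv21) {
  return src_uv[(x & ~1) + (is_nv21 ? 0 : 1)];
}

// Tail counts used by the kernels above:
//   width_last_y  = width & (vl - 1);                  // Y remainder
//   width_last_uv = width_last_y + (width_last_y & 1); // rounded up to even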
+void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint32_t nv_u_start = 0xff01U;
+ uint32_t nv_u_step = 0x0002U;
+ uint32_t nv_v_start = 0xff00U;
+ uint32_t nv_v_step = 0x0002U;
+ uint64_t vl;
+ asm("cntb %0" : "=r"(vl));
+ int width_last_y = width & (vl - 1);
+ int width_last_uv = width_last_y + (width_last_y & 1);
+ asm volatile(
+ "ptrue p0.b \n" //
+ YUVTORGB_SVE_SETUP
+ "dup z19.b, #255 \n" // A
+ "index z7.h, %w[nv_u_start], %w[nv_u_step] \n"
+ "index z23.h, %w[nv_v_start], %w[nv_v_step] \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p1.b \n"
+ "ptrue p2.b \n"
+ "1: \n" //
+ READNV_SVE_2X NVTORGB_SVE_2X(t, b) RGBTOARGB8_SVE_2X
+ "subs %w[width], %w[width], %w[vl] \n"
+ "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"
+ "incb %[dst_rgb24], all, mul #3 \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p1.b, wzr, %w[width_last_y] \n"
+ "whilelt p2.b, wzr, %w[width_last_uv] \n" //
+ READNV_SVE_2X NVTORGB_SVE_2X(t, b) RGBTOARGB8_SVE_2X
+ "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_vu), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [vl] "r"(vl), // %[vl]
+ [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [nv_u_start] "r"(nv_u_start), // %[nv_u_start]
+ [nv_u_step] "r"(nv_u_step), // %[nv_u_step]
+ [nv_v_start] "r"(nv_v_start), // %[nv_v_start]
+ [nv_v_step] "r"(nv_v_step), // %[nv_v_step]
+ [width_last_y] "r"(width_last_y), // %[width_last_y]
+ [width_last_uv] "r"(width_last_uv) // %[width_last_uv]
+ : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
}
// Dot-product constants are stored as four-tuples with the two innermost
@@ -1848,7 +2070,6 @@ void I210ToARGBRow_SVE2(const uint16_t* src_y,
uint64_t vl;
asm("cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1);
- width_last_y = width_last_y == 0 ? vl : width_last_y;
asm volatile(
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" // A
@@ -1887,6 +2108,102 @@ void I210ToARGBRow_SVE2(const uint16_t* src_y,
: "cc", "memory", YUVTORGB_SVE_REGS);
}
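With this loop structure a zero remainder branches straight to the end ("adds" followed by "b.eq 99f"), so a fix-up mapping a zero remainder to a full vector is no longer needed: the predicated tail only ever sees between 1 and vl - 1 elements. A minimal sketch of the resulting tail-count logic:

// Sketch only: tail element count with the epilogue used above.
static int tail_count(int width, int vl) {
  int width_last_y = width & (vl - 1);  // 0 when width is a multiple of vl
  // The asm restores the remainder with "adds" and takes "b.eq 99f" when it
  // is zero, so no "width_last_y == 0 ? vl : width_last_y" fix-up is needed.
  return width_last_y;
}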
+void I210AlphaToARGBRow_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t vl;
+ asm("cnth %0" : "=r"(vl));
+ int width_last_y = width & (vl - 1);
+ asm volatile(
+ "ptrue p0.b \n" YUVTORGB_SVE_SETUP
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p1.h \n"
+ "1: \n" READI210_SVE
+ "ld1h {z19.h}, p1/z, [%[src_a]] \n" I4XXTORGB_SVE
+ "incb %[src_a] \n" RGBATOARGB8_SVE
+ "subs %w[width], %w[width], %w[vl] \n"
+ "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+ "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p1.h, wzr, %w[width_last_y] \n" READI210_SVE
+ "ld1h {z19.h}, p1/z, [%[src_a]] \n" //
+ I4XXTORGB_SVE RGBATOARGB8_SVE
+ "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [vl] "r"(vl), // %[vl]
+ [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [width_last_y] "r"(width_last_y) // %[width_last_y]
+ : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
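The Alpha kernels differ from the plain ones only in where the A channel comes from: instead of broadcasting an opaque constant ("dup z19.b, #255"), z19 is loaded per pixel from the extra alpha plane and narrowed alongside the color channels by RGBATOARGB8_SVE. A minimal scalar sketch of that selection; the 10-bit-to-8-bit narrowing shown here is an assumption, since the actual narrowing lives inside the store macros:

#include <stdint.h>

// Sketch only: constant opaque alpha vs. per-pixel alpha from a 10-bit plane.
// Assumption: 10-bit alpha is narrowed to 8 bits by dropping the low bits.
static inline uint8_t argb_alpha(const uint16_t* src_a, int x, int has_alpha) {
  return has_alpha ? (uint8_t)(src_a[x] >> 2) : 0xff;
}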
+void I210ToAR30Row_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t vl;
+ asm("cnth %0" : "=r"(vl));
+ int width_last_y = width & (vl - 1);
+ uint16_t limit = 0x3ff0;
+ asm volatile(
+ "ptrue p0.b \n" YUVTORGB_SVE_SETUP
+ "dup z23.h, %w[limit] \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p1.h \n"
+ "1: \n" //
+ READI210_SVE I4XXTORGB_SVE STOREAR30_SVE
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p1.h, wzr, %w[width_last_y] \n" //
+ READI210_SVE I4XXTORGB_SVE STOREAR30_SVE
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
+ [width] "+r"(width) // %[width]
+ : [vl] "r"(vl), // %[vl]
+ [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [width_last_y] "r"(width_last_y), // %[width_last_y]
+ [limit] "r"(limit) // %[limit]
+ : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
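AR30 output packs a 2-bit alpha and three 10-bit channels into one 32-bit word; the 0x3ff0 constant broadcast into z23.h is presumably the clamp applied to the converted channels at their fixed-point position (0x3ff << 4) before STOREAR30_SVE packs them. A minimal sketch of the packing itself, assuming libyuv's 2:10:10:10 little-endian AR30 layout with blue in the low bits:

#include <stdint.h>

// Sketch only: pack one AR30 pixel from channels already clamped to 10 bits
// (a2 is the 2-bit alpha).
static inline uint32_t pack_ar30(uint32_t b, uint32_t g, uint32_t r, uint32_t a2) {
  return (a2 << 30) | (r << 20) | (g << 10) | b;
}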
// P210 has 10 bits in msb of 16 bit NV12 style layout.
void P210ToARGBRow_SVE2(const uint16_t* src_y,
const uint16_t* src_uv,
@@ -1896,7 +2213,6 @@ void P210ToARGBRow_SVE2(const uint16_t* src_y,
uint64_t vl;
asm("cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1);
- width_last_y = width_last_y == 0 ? vl : width_last_y;
int width_last_uv = width_last_y + (width_last_y & 1);
uint32_t nv_uv_start = 0x03010301U;
uint32_t nv_uv_step = 0x04040404U;
@@ -1951,7 +2267,6 @@ void P210ToAR30Row_SVE2(const uint16_t* src_y,
uint64_t vl;
asm("cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1);
- width_last_y = width_last_y == 0 ? vl : width_last_y;
int width_last_uv = width_last_y + (width_last_y & 1);
uint32_t nv_uv_start = 0x03010301U;
uint32_t nv_uv_step = 0x04040404U;
@@ -1998,6 +2313,150 @@ void P210ToAR30Row_SVE2(const uint16_t* src_y,
: "cc", "memory", YUVTORGB_SVE_REGS);
}
+void I410ToARGBRow_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t vl;
+ asm("cnth %0" : "=r"(vl));
+ int width_last_y = width & (vl - 1);
+ asm volatile(
+ "ptrue p0.b \n" YUVTORGB_SVE_SETUP
+ "dup z19.b, #255 \n" // A
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p1.h \n"
+ "1: \n" //
+ READI410_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
+ "subs %w[width], %w[width], %w[vl] \n"
+ "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+ "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p1.h, wzr, %w[width_last_y] \n" //
+ READI410_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
+ "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [vl] "r"(vl), // %[vl]
+ [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [width_last_y] "r"(width_last_y) // %[width_last_y]
+ : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
+void I410AlphaToARGBRow_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t vl;
+ asm("cnth %0" : "=r"(vl));
+ int width_last_y = width & (vl - 1);
+ asm volatile(
+ "ptrue p0.b \n" YUVTORGB_SVE_SETUP
+ "cmp %w[width], %w[vl] \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p1.h \n"
+ "1: \n" READI410_SVE
+ "ld1h {z19.h}, p1/z, [%[src_a]] \n" I4XXTORGB_SVE
+ "incb %[src_a] \n" RGBATOARGB8_SVE
+ "subs %w[width], %w[width], %w[vl] \n"
+ "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+ "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p1.h, wzr, %w[width_last_y] \n" READI410_SVE
+ "ld1h {z19.h}, p1/z, [%[src_a]] \n" //
+ I4XXTORGB_SVE RGBATOARGB8_SVE
+ "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [vl] "r"(vl), // %[vl]
+ [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [width_last_y] "r"(width_last_y) // %[width_last_y]
+ : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
+void I410ToAR30Row_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t vl;
+ asm("cnth %0" : "=r"(vl));
+ int width_last_y = width & (vl - 1);
+ uint16_t limit = 0x3ff0;
+ asm volatile(
+ "ptrue p0.b \n" YUVTORGB_SVE_SETUP
+ "dup z23.h, %w[limit] \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p1.h \n"
+ "1: \n" //
+ READI410_SVE I4XXTORGB_SVE STOREAR30_SVE
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p1.h, wzr, %w[width_last_y] \n" //
+ READI410_SVE I4XXTORGB_SVE STOREAR30_SVE
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
+ [width] "+r"(width) // %[width]
+ : [vl] "r"(vl), // %[vl]
+ [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [width_last_y] "r"(width_last_y), // %[width_last_y]
+ [limit] "r"(limit) // %[limit]
+ : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
void P410ToARGBRow_SVE2(const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_argb,
@@ -2006,7 +2465,6 @@ void P410ToARGBRow_SVE2(const uint16_t* src_y,
uint64_t vl;
asm("cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1);
- width_last_y = width_last_y == 0 ? vl : width_last_y;
asm volatile(
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" // A
@@ -2057,7 +2515,6 @@ void P410ToAR30Row_SVE2(const uint16_t* src_y,
uint64_t vl;
asm("cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1);
- width_last_y = width_last_y == 0 ? vl : width_last_y;
uint16_t limit = 0x3ff0;
asm volatile(
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
@@ -2100,6 +2557,99 @@ void P410ToAR30Row_SVE2(const uint16_t* src_y,
: "cc", "memory", YUVTORGB_SVE_REGS);
}
+void I212ToAR30Row_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t vl;
+ asm("cnth %0" : "=r"(vl));
+ int width_last_y = width & (vl - 1);
+ uint16_t limit = 0x3ff0;
+ asm volatile(
+ "ptrue p0.b \n" YUVTORGB_SVE_SETUP
+ "dup z23.h, %w[limit] \n"
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p1.h \n"
+ "1: \n" //
+ READI212_SVE I4XXTORGB_SVE STOREAR30_SVE
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p1.h, wzr, %w[width_last_y] \n" //
+ READI212_SVE I4XXTORGB_SVE STOREAR30_SVE
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
+ [width] "+r"(width) // %[width]
+ : [vl] "r"(vl), // %[vl]
+ [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [width_last_y] "r"(width_last_y), // %[width_last_y]
+ [limit] "r"(limit) // %[limit]
+ : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
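I212 carries 12-bit samples; the only difference from the 10-bit I210 kernels above is the READI212_SVE load/normalization macro, while the color-matrix multiply (I4XXTORGB_SVE) and the ARGB/AR30 stores are shared. A sketch of the kind of normalization this implies, stated as an assumption since the actual shift lives inside the READ* macros, which are not part of this hunk:

#include <stdint.h>

// Assumption: 10- and 12-bit samples are aligned to a common fixed-point
// position before the shared matrix multiply.
static inline uint16_t align_sample(uint16_t sample, int bit_depth) {
  return (uint16_t)(sample << (16 - bit_depth));
}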
+void I212ToARGBRow_SVE2(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ uint64_t vl;
+ asm("cnth %0" : "=r"(vl));
+ int width_last_y = width & (vl - 1);
+ asm volatile(
+ "ptrue p0.b \n" YUVTORGB_SVE_SETUP
+ "dup z19.b, #255 \n" // A
+ "subs %w[width], %w[width], %w[vl] \n"
+ "b.lt 2f \n"
+
+ // Run bulk of computation with an all-true predicate to avoid predicate
+ // generation overhead.
+ "ptrue p1.h \n"
+ "1: \n" //
+ READI212_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
+ "subs %w[width], %w[width], %w[vl] \n"
+ "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+ "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[width], %w[width], %w[vl] \n"
+ "b.eq 99f \n"
+
+ // Calculate a predicate for the final iteration to deal with the tail.
+ "whilelt p1.h, wzr, %w[width_last_y] \n" //
+ READI212_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
+ "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+
+ "99: \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [vl] "r"(vl), // %[vl]
+ [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [width_last_y] "r"(width_last_y) // %[width_last_y]
+ : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale.cc
index 661224166..8b8315043 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/scale.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/scale.cc
@@ -188,8 +188,9 @@ static void ScalePlaneDown2_16(int src_width,
#if defined(HAS_SCALEROWDOWN2_16_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 =
- filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_NEON
+ : filtering == kFilterLinear ? ScaleRowDown2Linear_16_NEON
+ : ScaleRowDown2Box_16_NEON;
}
#endif
#if defined(HAS_SCALEROWDOWN2_16_SSE2)
diff --git a/libfenrir/src/main/jni/animation/libyuv/source/scale_neon64.cc b/libfenrir/src/main/jni/animation/libyuv/source/scale_neon64.cc
index c125c6c09..69c51b1bb 100644
--- a/libfenrir/src/main/jni/animation/libyuv/source/scale_neon64.cc
+++ b/libfenrir/src/main/jni/animation/libyuv/source/scale_neon64.cc
@@ -1354,6 +1354,71 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
#undef SCALE_ARGB_FILTER_COLS_STEP_ADDR
+void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "subs %w[dst_width], %w[dst_width], #32 \n"
+ "b.lt 2f \n"
+
+ "1: \n"
+ "ldp q0, q1, [%[src_ptr]] \n"
+ "ldp q2, q3, [%[src_ptr], #32] \n"
+ "ldp q4, q5, [%[src_ptr], #64] \n"
+ "ldp q6, q7, [%[src_ptr], #96] \n"
+ "add %[src_ptr], %[src_ptr], #128 \n"
+ "uzp2 v0.8h, v0.8h, v1.8h \n"
+ "uzp2 v1.8h, v2.8h, v3.8h \n"
+ "uzp2 v2.8h, v4.8h, v5.8h \n"
+ "uzp2 v3.8h, v6.8h, v7.8h \n"
+ "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
+ "stp q0, q1, [%[dst_ptr]] \n"
+ "stp q2, q3, [%[dst_ptr], #32] \n"
+ "add %[dst_ptr], %[dst_ptr], #64 \n"
+ "b.ge 1b \n"
+
+ "2: \n"
+ "adds %w[dst_width], %w[dst_width], #32 \n"
+ "b.eq 99f \n"
+
+ "ldp q0, q1, [%[src_ptr]] \n"
+ "ldp q2, q3, [%[src_ptr], #32] \n"
+ "uzp2 v0.8h, v0.8h, v1.8h \n"
+ "uzp2 v1.8h, v2.8h, v3.8h \n"
+ "stp q0, q1, [%[dst_ptr]] \n"
+
+ "99: \n"
+ : [src_ptr] "+r"(src_ptr), // %[src_ptr]
+ [dst_ptr] "+r"(dst), // %[dst_ptr]
+ [dst_width] "+r"(dst_width) // %[dst_width]
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
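The uzp2 sequence above keeps the second (odd-indexed) element of every horizontal pair, matching the unfiltered C fallback. A minimal scalar sketch of the same behavior (not the NEON code path):

#include <stdint.h>

// Sketch only: down-by-2 without filtering keeps the odd pixel of each pair,
// which is what the "uzp2" instructions select.
static void ScaleRowDown2_16_sketch(const uint16_t* src_ptr,
                                    uint16_t* dst,
                                    int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x + 1];
  }
}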
+void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n"
+ "ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n"
+ "subs %w[dst_width], %w[dst_width], #16 \n"
+ "urhadd v0.8h, v0.8h, v1.8h \n"
+ "urhadd v1.8h, v2.8h, v3.8h \n"
+ "prfm pldl1keep, [%[src_ptr], 448] \n"
+ "stp q0, q1, [%[dst_ptr]], #32 \n"
+ "b.gt 1b \n"
+ : [src_ptr] "+r"(src_ptr), // %[src_ptr]
+ [dst_ptr] "+r"(dst), // %[dst_ptr]
+ [dst_width] "+r"(dst_width) // %[dst_width]
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
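The linear variant averages each horizontal pair with rounding; "urhadd" computes (a + b + 1) >> 1 per lane without intermediate overflow. A minimal scalar sketch of the equivalent filtering:

#include <stdint.h>

// Sketch only: down-by-2 with horizontal-only (linear) filtering, i.e. the
// rounding average that "urhadd" performs per lane.
static void ScaleRowDown2Linear_16_sketch(const uint16_t* src_ptr,
                                          uint16_t* dst,
                                          int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint16_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}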
// Read 16x2 average down and write 8x1.
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
diff --git a/libfenrir/src/main/jni/animation/thorvg/inc/colorreplace.h b/libfenrir/src/main/jni/animation/thorvg/inc/colorreplace.h
index 4033898f0..a9727801a 100644
--- a/libfenrir/src/main/jni/animation/thorvg/inc/colorreplace.h
+++ b/libfenrir/src/main/jni/animation/thorvg/inc/colorreplace.h
@@ -1,5 +1,6 @@
#ifndef _ColorReplace_H_
#define _ColorReplace_H_
+
#include