diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h index 56254b5f99..66b93c1c74 100644 --- a/externals/sse2neon/sse2neon.h +++ b/externals/sse2neon/sse2neon.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2015-2024 SSE2NEON Contributors +// SPDX-License-Identifier: MIT + #ifndef SSE2NEON_H #define SSE2NEON_H diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp index dc3504f53b..3ad56bb80c 100644 --- a/src/video_core/host1x/vic.cpp +++ b/src/video_core/host1x/vic.cpp @@ -248,16 +248,19 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, #endif #if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) + const auto alpha_linear{static_cast(slot.config.planar_alpha.Value())}; const auto alpha = _mm_slli_epi64(_mm_set1_epi64x(static_cast(slot.config.planar_alpha.Value())), 48); const auto shuffle_mask = _mm_set_epi8(13, 15, 14, 12, 9, 11, 10, 8, 5, 7, 6, 4, 1, 3, 2, 0); + const auto sse_aligned_width = Common::AlignDown(in_luma_width, 16); for (s32 y = 0; y < in_luma_height; y++) { const auto src_luma{y * in_luma_stride}; const auto src_chroma{(y / 2) * in_chroma_stride}; const auto dst{y * out_luma_stride}; - for (s32 x = 0; x < in_luma_width; x += 16) { + s32 x = 0; + for (; x < sse_aligned_width; x += 16) { // clang-format off // Prefetch next iteration's memory _mm_prefetch((const char*)&luma_buffer[src_luma + x + 16], _MM_HINT_T0); @@ -381,6 +384,23 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, // clang-format on } + + for (; x < in_luma_width; x++) { + slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); + // Chroma samples are duplicated horizontally and vertically. + if constexpr (Planar) { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = + static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); + } else { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); + } + slot_surface[dst + x].a = alpha_linear; + } } #else DecodeLinear(); @@ -827,11 +847,14 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF] const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); + const auto sse_aligned_width = Common::AlignDown(surface_width, 16); + for (u32 y = 0; y < surface_height; ++y) { const auto src = y * surface_stride; const auto dst_luma = y * out_luma_stride; const auto dst_chroma = (y / 2) * out_chroma_stride; - for (u32 x = 0; x < surface_width; x += 16) { + u32 x = 0; + for (; x < sse_aligned_width; x += 16) { // clang-format off // Prefetch the next cache lines, 2 per iteration _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); @@ -949,6 +972,16 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { // clang-format on } + + const auto src_chroma = y * surface_stride; + for (; x < surface_width; x += 2) { + out_luma[dst_luma + x + 0] = static_cast(output_surface[src + x + 0].r >> 2); + out_luma[dst_luma + x + 1] = static_cast(output_surface[src + x + 1].r >> 2); + out_chroma[dst_chroma + x + 0] = + static_cast(output_surface[src_chroma + x].g >> 2); + out_chroma[dst_chroma + x + 1] = + static_cast(output_surface[src_chroma + x].b >> 2); + } } #else DecodeLinear(out_luma, out_chroma); @@ -1083,10 +1116,14 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { #endif #if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) + constexpr size_t SseAlignment = 16; + const auto sse_aligned_width = Common::AlignDown(surface_width, SseAlignment); + for (u32 y = 0; y < surface_height; y++) { const auto src = y * surface_stride; const auto dst = y * out_luma_stride; - for (u32 x = 0; x < surface_width; x += 16) { + u32 x = 0; + for (; x < sse_aligned_width; x += SseAlignment) { // clang-format off // Prefetch the next 2 cache lines _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); @@ -1146,6 +1183,20 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { // clang-format on } + + for (; x < surface_width; x++) { + if constexpr (Format == VideoPixelFormat::A8R8G8B8) { + out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); + } else { + out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); + } + } } #else DecodeLinear(out_buffer);