lavfi: add nlmeans_vulkan filter
Commit 160a415e22 (parent dfff3877b7).
1
configure
vendored
1
configure
vendored
@ -3705,6 +3705,7 @@ minterpolate_filter_select="scene_sad"
|
||||
mptestsrc_filter_deps="gpl"
|
||||
negate_filter_deps="lut_filter"
|
||||
nlmeans_opencl_filter_deps="opencl"
|
||||
nlmeans_vulkan_filter_deps="vulkan spirv_compiler"
|
||||
nnedi_filter_deps="gpl"
|
||||
ocr_filter_deps="libtesseract"
|
||||
ocv_filter_deps="libopencv"
|
||||
|
@ -390,6 +390,8 @@ OBJS-$(CONFIG_MULTIPLY_FILTER) += vf_multiply.o
|
||||
OBJS-$(CONFIG_NEGATE_FILTER) += vf_negate.o
|
||||
OBJS-$(CONFIG_NLMEANS_FILTER) += vf_nlmeans.o
|
||||
OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
|
||||
OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER) += vf_nlmeans_vulkan.o vulkan.o vulkan_filter.o \
|
||||
vulkan/prefix_sum.o
|
||||
OBJS-$(CONFIG_NNEDI_FILTER) += vf_nnedi.o
|
||||
OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o
|
||||
OBJS-$(CONFIG_NOISE_FILTER) += vf_noise.o
|
||||
|
@ -368,6 +368,7 @@ extern const AVFilter ff_vf_multiply;
|
||||
extern const AVFilter ff_vf_negate;
|
||||
extern const AVFilter ff_vf_nlmeans;
|
||||
extern const AVFilter ff_vf_nlmeans_opencl;
|
||||
extern const AVFilter ff_vf_nlmeans_vulkan;
|
||||
extern const AVFilter ff_vf_nnedi;
|
||||
extern const AVFilter ff_vf_noformat;
|
||||
extern const AVFilter ff_vf_noise;
|
||||
|
1122
libavfilter/vf_nlmeans_vulkan.c
Normal file
1122
libavfilter/vf_nlmeans_vulkan.c
Normal file
File diff suppressed because it is too large
Load Diff
151
libavfilter/vulkan/prefix_sum.comp
Normal file
151
libavfilter/vulkan/prefix_sum.comp
Normal file
@ -0,0 +1,151 @@
|
||||
#extension GL_EXT_buffer_reference : require
|
||||
#extension GL_EXT_buffer_reference2 : require
|
||||
|
||||
#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire
|
||||
#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease
|
||||
|
||||
// These correspond to X, A, P respectively in the prefix sum paper.
|
||||
#define FLAG_NOT_READY 0u
|
||||
#define FLAG_AGGREGATE_READY 1u
|
||||
#define FLAG_PREFIX_READY 2u
|
||||
|
||||
layout(buffer_reference, buffer_reference_align = T_ALIGN) nonprivate buffer StateData {
|
||||
DTYPE aggregate;
|
||||
DTYPE prefix;
|
||||
uint flag;
|
||||
};
|
||||
|
||||
shared DTYPE sh_scratch[WG_SIZE];
|
||||
shared DTYPE sh_prefix;
|
||||
shared uint sh_part_ix;
|
||||
shared uint sh_flag;
|
||||
|
||||
void prefix_sum(DataBuffer dst, uint dst_stride, DataBuffer src, uint src_stride)
|
||||
{
|
||||
DTYPE local[N_ROWS];
|
||||
// Determine partition to process by atomic counter (described in Section 4.4 of prefix sum paper).
|
||||
if (gl_GlobalInvocationID.x == 0)
|
||||
sh_part_ix = gl_WorkGroupID.x;
|
||||
// sh_part_ix = atomicAdd(part_counter, 1);
|
||||
|
||||
barrier();
|
||||
uint part_ix = sh_part_ix;
|
||||
|
||||
uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
|
||||
|
||||
// TODO: gate buffer read? (evaluate whether shader check or CPU-side padding is better)
|
||||
local[0] = src.v[ix*src_stride];
|
||||
for (uint i = 1; i < N_ROWS; i++)
|
||||
local[i] = local[i - 1] + src.v[(ix + i)*src_stride];
|
||||
|
||||
DTYPE agg = local[N_ROWS - 1];
|
||||
sh_scratch[gl_LocalInvocationID.x] = agg;
|
||||
for (uint i = 0; i < LG_WG_SIZE; i++) {
|
||||
barrier();
|
||||
if (gl_LocalInvocationID.x >= (1u << i))
|
||||
agg += sh_scratch[gl_LocalInvocationID.x - (1u << i)];
|
||||
barrier();
|
||||
|
||||
sh_scratch[gl_LocalInvocationID.x] = agg;
|
||||
}
|
||||
|
||||
// Publish aggregate for this partition
|
||||
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||
state[part_ix].aggregate = agg;
|
||||
if (part_ix == 0)
|
||||
state[0].prefix = agg;
|
||||
}
|
||||
|
||||
// Write flag with release semantics
|
||||
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||
uint flag = part_ix == 0 ? FLAG_PREFIX_READY : FLAG_AGGREGATE_READY;
|
||||
atomicStore(state[part_ix].flag, flag, gl_ScopeDevice, RELEASE);
|
||||
}
|
||||
|
||||
DTYPE exclusive = DTYPE(0);
|
||||
if (part_ix != 0) {
|
||||
// step 4 of paper: decoupled lookback
|
||||
uint look_back_ix = part_ix - 1;
|
||||
|
||||
DTYPE their_agg;
|
||||
uint their_ix = 0;
|
||||
while (true) {
|
||||
// Read flag with acquire semantics.
|
||||
if (gl_LocalInvocationID.x == WG_SIZE - 1)
|
||||
sh_flag = atomicLoad(state[look_back_ix].flag, gl_ScopeDevice, ACQUIRE);
|
||||
|
||||
// The flag load is done only in the last thread. However, because the
|
||||
// translation of memoryBarrierBuffer to Metal requires uniform control
|
||||
// flow, we broadcast it to all threads.
|
||||
barrier();
|
||||
|
||||
uint flag = sh_flag;
|
||||
barrier();
|
||||
|
||||
if (flag == FLAG_PREFIX_READY) {
|
||||
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||
DTYPE their_prefix = state[look_back_ix].prefix;
|
||||
exclusive = their_prefix + exclusive;
|
||||
}
|
||||
break;
|
||||
} else if (flag == FLAG_AGGREGATE_READY) {
|
||||
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||
their_agg = state[look_back_ix].aggregate;
|
||||
exclusive = their_agg + exclusive;
|
||||
}
|
||||
look_back_ix--;
|
||||
their_ix = 0;
|
||||
continue;
|
||||
} // else spins
|
||||
|
||||
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||
// Unfortunately there's no guarantee of forward progress of other
|
||||
// workgroups, so compute a bit of the aggregate before trying again.
|
||||
// In the worst case, spinning stops when the aggregate is complete.
|
||||
DTYPE m = src.v[(look_back_ix * PARTITION_SIZE + their_ix)*src_stride];
|
||||
if (their_ix == 0)
|
||||
their_agg = m;
|
||||
else
|
||||
their_agg += m;
|
||||
|
||||
their_ix++;
|
||||
if (their_ix == PARTITION_SIZE) {
|
||||
exclusive = their_agg + exclusive;
|
||||
if (look_back_ix == 0) {
|
||||
sh_flag = FLAG_PREFIX_READY;
|
||||
} else {
|
||||
look_back_ix--;
|
||||
their_ix = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
flag = sh_flag;
|
||||
barrier();
|
||||
if (flag == FLAG_PREFIX_READY)
|
||||
break;
|
||||
}
|
||||
|
||||
// step 5 of paper: compute inclusive prefix
|
||||
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||
DTYPE inclusive_prefix = exclusive + agg;
|
||||
sh_prefix = exclusive;
|
||||
state[part_ix].prefix = inclusive_prefix;
|
||||
}
|
||||
|
||||
if (gl_LocalInvocationID.x == WG_SIZE - 1)
|
||||
atomicStore(state[part_ix].flag, FLAG_PREFIX_READY, gl_ScopeDevice, RELEASE);
|
||||
}
|
||||
|
||||
barrier();
|
||||
if (part_ix != 0)
|
||||
exclusive = sh_prefix;
|
||||
|
||||
DTYPE row = exclusive;
|
||||
if (gl_LocalInvocationID.x > 0)
|
||||
row += sh_scratch[gl_LocalInvocationID.x - 1];
|
||||
|
||||
// note - may overwrite
|
||||
for (uint i = 0; i < N_ROWS; i++)
|
||||
dst.v[(ix + i)*dst_stride] = row + local[i];
|
||||
}
|
@ -133,6 +133,7 @@ typedef enum FFVulkanExtensions {
|
||||
MACRO(1, 1, FF_VK_EXT_NO_FLAG, CreateBuffer) \
|
||||
MACRO(1, 1, FF_VK_EXT_NO_FLAG, BindBufferMemory) \
|
||||
MACRO(1, 1, FF_VK_EXT_NO_FLAG, GetBufferDeviceAddress) \
|
||||
MACRO(1, 1, FF_VK_EXT_NO_FLAG, CmdFillBuffer) \
|
||||
MACRO(1, 1, FF_VK_EXT_NO_FLAG, DestroyBuffer) \
|
||||
\
|
||||
/* Image */ \
|
||||
|
Loading…
x
Reference in New Issue
Block a user