/* comb_detect_template.c
Copyright (c) 2003-2025 HandBrake Team
This file is part of the HandBrake source code
Homepage: <http://handbrake.fr/>.
It may be used under the terms of the GNU General Public License v2.
For full terms see the COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
*/
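/* This template is compiled once per supported bit depth: BIT_DEPTH selects
   the sample type (uint8_t or uint16_t) and FUNC() appends a _8 or _16
   suffix so both variants can coexist in the same binary. */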
#if BIT_DEPTH > 8
# define pixel uint16_t
# define FUNC(name) name##_##16
#else
# define pixel uint8_t
# define FUNC(name) name##_##8
#endif
#if defined (__aarch64__) && !defined(__APPLE__)
#include <arm_neon.h>
#endif
static inline void FUNC(draw_mask_box)(hb_filter_private_t *pv)
{
const int x = pv->mask_box_x;
const int y = pv->mask_box_y;
const int box_width = pv->block_width;
const int box_height = pv->block_height;
int stride;
uint8_t *mskp;
if (pv->mode & MODE_FILTER)
{
mskp = pv->mask_filtered->plane[0].data;
stride = pv->mask_filtered->plane[0].stride;
}
else
{
mskp = pv->mask->plane[0].data;
stride = pv->mask->plane[0].stride;
}
for (int block_x = 0; block_x < box_width; block_x++)
{
mskp[ y * stride + x + block_x] = 128;
mskp[(y + box_height) * stride + x + block_x] = 128;
}
for (int block_y = 0; block_y < box_height; block_y++)
{
mskp[stride * (y + block_y) + x ] = 128;
mskp[stride * (y + block_y) + x + box_width] = 128;
}
}
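// Translate one row of mask values into visible pixels: 1 (combed) becomes
// the maximum sample value, 128 (box outline) becomes mid-gray.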
static inline void FUNC(apply_mask_line)(pixel *srcp,
const uint8_t *mskp,
const int width,
const int max,
const int half)
{
for (int x = 0; x < width; x++)
{
if (mskp[x] == 1)
{
srcp[x] = max;
}
else if (mskp[x] == 128)
{
srcp[x] = half;
}
}
}
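// Render the combing mask into the output frame. Unless MODE_COMPOSITE is
// set, luma is blanked and chroma set to neutral first, so only the mask
// itself remains visible.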
static void FUNC(apply_mask)(hb_filter_private_t *pv, hb_buffer_t *b)
{
// Draw boxes
FUNC(draw_mask_box)(pv);
const hb_buffer_t *m;
const int max = pv->max_value;
const int half = pv->half_value;
if (pv->mode & MODE_FILTER)
{
m = pv->mask_filtered;
}
else
{
m = pv->mask;
}
for (int pp = 0; pp < 3; pp++)
{
pixel *dstp = (pixel *)b->plane[pp].data;
const int dstp_stride = b->plane[pp].stride / pv->bps;
const int width = m->plane[pp].width;
const int height = m->plane[pp].height;
if (!(pv->mode & MODE_COMPOSITE))
{
if (pp == 0)
{
memset(dstp, 0, b->plane[pp].size);
}
else
{
if (pv->depth == 8)
{
memset(dstp, half, b->plane[pp].size);
}
else
{
for (int i = 0; i < b->plane[pp].size / pv->bps; i++)
{
dstp[i] = half;
}
}
}
}
if (pp == 0)
{
const uint8_t *mskp = m->plane[0].data;
const int mskp_stride = m->plane[0].stride;
for (int yy = 0; yy < height; yy++)
{
FUNC(apply_mask_line)(dstp, mskp, width, max, half);
dstp += dstp_stride;
mskp += mskp_stride;
}
}
}
}
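/* Comb detection on gamma-corrected luma. Two implementations follow: a
   NEON path for non-Apple aarch64 builds and a portable scalar fallback,
   both implementing the same per-pixel algorithm. */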
#if defined (__aarch64__) && !defined(__APPLE__)
static void FUNC(detect_gamma_combed_segment)(hb_filter_private_t *pv,
int segment_start, int segment_stop)
{
// A mishmash of various comb detection tricks
// picked up from neuron2's Decomb plugin for
// AviSynth and tritical's IsCombedT and
// IsCombedTIVTC plugins.
// Comb scoring algorithm
const float mthresh = pv->gamma_motion_threshold;
const float athresh = pv->gamma_spatial_threshold;
const float athresh6 = pv->gamma_spatial_threshold6;
// One pass for Y
const int stride_prev = pv->ref[0]->plane[0].stride / pv->bps;
const int stride_cur = pv->ref[1]->plane[0].stride / pv->bps;
const int stride_next = pv->ref[2]->plane[0].stride / pv->bps;
const int width = pv->ref[0]->plane[0].width;
const int height = pv->ref[0]->plane[0].height;
const int mask_stride = pv->mask->plane[0].stride;
// Comb detection has to start at y = 2 and end at
// y = height - 2, because it needs to examine
// 2 pixels above and 2 below the current pixel.
if (segment_start < 2)
{
segment_start = 2;
}
if (segment_stop > height - 2)
{
segment_stop = height - 2;
}
// These are just to make the buffer locations easier to read.
const int up_1_prev = -1 * stride_prev;
const int down_1_prev = stride_prev;
const int up_2 = -2 * stride_cur;
const int up_1 = -1 * stride_cur;
const int down_1 = stride_cur;
const int down_2 = 2 * stride_cur;
const int up_1_next = -1 * stride_next;
const int down_1_next = stride_next;
float32x4_t v_athresh = vdupq_n_f32(athresh);
float32x4_t v_athresh_neg = vdupq_n_f32(-athresh);
float32x4_t v_mthresh = vdupq_n_f32(mthresh);
float32x4_t v_athresh6 = vdupq_n_f32(athresh6);
float32x4_t v_four = vdupq_n_f32(4.0f);
float32x4_t v_three = vdupq_n_f32(3.0f);
uint32x4_t v_one = vdupq_n_u32(1);
uint32x4_t v_exhaustive_check = vdupq_n_u32(pv->force_exaustive_check);
for (int y = segment_start; y < segment_stop; y++)
{
// We need to examine a column of 5 pixels
// in the prev, cur, and next frames.
const pixel *prev = &((const pixel *)pv->ref[0]->plane[0].data)[y * stride_prev];
const pixel *cur = &((const pixel *)pv->ref[1]->plane[0].data)[y * stride_cur];
const pixel *next = &((const pixel *)pv->ref[2]->plane[0].data)[y * stride_next];
uint8_t *mask = &pv->mask->plane[0].data[y * mask_stride];
memset(mask, 0, mask_stride);
uint32_t mask32[4];
for (int x = 0; x < width; x += 4)
{
uint32x4_t mask_vec = vdupq_n_u32(0);
float32x4_t cur_vec = {pv->gamma_lut[cur[0]], pv->gamma_lut[cur[1]], pv->gamma_lut[cur[2]], pv->gamma_lut[cur[3]]};
float32x4_t cur_up1_vec = {pv->gamma_lut[cur[up_1 + 0]], pv->gamma_lut[cur[up_1 + 1]], pv->gamma_lut[cur[up_1 + 2]], pv->gamma_lut[cur[up_1 + 3]]};
float32x4_t cur_down1_vec = {pv->gamma_lut[cur[down_1 + 0]], pv->gamma_lut[cur[down_1 + 1]], pv->gamma_lut[cur[down_1 + 2]], pv->gamma_lut[cur[down_1 + 3]]};
float32x4_t up_diff1 = vsubq_f32(cur_vec, cur_up1_vec);
float32x4_t down_diff1 = vsubq_f32(cur_vec, cur_down1_vec);
uint32x4_t cond1 = vcgtq_f32(up_diff1, v_athresh);
uint32x4_t cond2 = vcgtq_f32(down_diff1, v_athresh);
uint32x4_t cond3 = vcltq_f32(up_diff1, v_athresh_neg);
uint32x4_t cond4 = vcltq_f32(down_diff1, v_athresh_neg);
uint32x4_t condition1 = vandq_u32(cond1, cond2);
uint32x4_t condition2 = vandq_u32(cond3, cond4);
uint32x4_t condition = vorrq_u32(condition1, condition2);
if (vmaxvq_u32(condition) > 0)
{
uint32x4_t motion = vdupq_n_u32(0);
uint32x4_t motion1 = vdupq_n_u32(0);
if (mthresh > 0)
{
float32x4_t prev_vec = {pv->gamma_lut[prev[0]], pv->gamma_lut[prev[1]], pv->gamma_lut[prev[2]], pv->gamma_lut[prev[3]]};
float32x4_t next_vec = {pv->gamma_lut[next[0]], pv->gamma_lut[next[1]], pv->gamma_lut[next[2]], pv->gamma_lut[next[3]]};
float32x4_t next_up_1_vec = {pv->gamma_lut[next[up_1_next + 0]], pv->gamma_lut[next[up_1_next + 1]], pv->gamma_lut[next[up_1_next + 2]], pv->gamma_lut[next[up_1_next + 3]]};
float32x4_t next_down_1_vec = {pv->gamma_lut[next[down_1_next + 0]], pv->gamma_lut[next[down_1_next + 1]], pv->gamma_lut[next[down_1_next + 2]], pv->gamma_lut[next[down_1_next + 3]]};
float32x4_t abs_diff1 = vabsq_f32(vsubq_f32(prev_vec, cur_vec));
float32x4_t abs_diff2 = vabsq_f32(vsubq_f32(cur_up1_vec, next_up_1_vec));
float32x4_t abs_diff3 = vabsq_f32(vsubq_f32(cur_down1_vec, next_down_1_vec));
uint32x4_t motion_cond1 = vcgtq_f32(abs_diff1, v_mthresh);
uint32x4_t motion_cond2 = vcgtq_f32(abs_diff2, v_mthresh);
uint32x4_t motion_cond3 = vcgtq_f32(abs_diff3, v_mthresh);
motion = vandq_u32(vandq_u32(motion_cond1, motion_cond2), motion_cond3);
float32x4_t prev_up_1_vec = {pv->gamma_lut[prev[up_1_prev + 0]], pv->gamma_lut[prev[up_1_prev + 1]], pv->gamma_lut[prev[up_1_prev + 2]], pv->gamma_lut[prev[up_1_prev + 3]]};
float32x4_t prev_down_1_vec = {pv->gamma_lut[prev[down_1_prev + 0]], pv->gamma_lut[prev[down_1_prev + 1]], pv->gamma_lut[prev[down_1_prev + 2]], pv->gamma_lut[prev[down_1_prev + 3]]};
float32x4_t abs_diff4 = vabsq_f32(vsubq_f32(next_vec, cur_vec));
float32x4_t abs_diff5 = vabsq_f32(vsubq_f32(prev_up_1_vec, cur_up1_vec));
float32x4_t abs_diff6 = vabsq_f32(vsubq_f32(prev_down_1_vec, cur_down1_vec));
uint32x4_t motion_cond4 = vcgtq_f32(abs_diff4, v_mthresh);
uint32x4_t motion_cond5 = vcgtq_f32(abs_diff5, v_mthresh);
uint32x4_t motion_cond6 = vcgtq_f32(abs_diff6, v_mthresh);
motion1 = vandq_u32(vandq_u32(motion_cond4, motion_cond5), motion_cond6);
motion = vorrq_u32(motion, motion1);
}
else
{
motion = vdupq_n_u32(1);
}
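// mask_vec is still all zeroes here, so comparing against it promotes any
// nonzero motion/exhaustive-check lane to an all-ones mask for the ANDs below.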
uint32x4_t motion_check = vorrq_u32(motion, v_exhaustive_check);
motion_check = vcgtq_u32(motion_check, mask_vec);
float32x4_t cur_up2_vec = {pv->gamma_lut[cur[up_2 + 0]], pv->gamma_lut[cur[up_2 + 1]], pv->gamma_lut[cur[up_2 + 2]], pv->gamma_lut[cur[up_2 + 3]]};
float32x4_t cur_down2_vec = {pv->gamma_lut[cur[down_2 + 0]], pv->gamma_lut[cur[down_2 + 1]], pv->gamma_lut[cur[down_2 + 2]], pv->gamma_lut[cur[down_2 + 3]]};
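// Vector form of Tritical's noise-resistant scorer: a vertical
// [1 -3 4 -3 1] convolution, matching the scalar branch below.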
float32x4_t combing1 = vabsq_f32(vsubq_f32(vaddq_f32(vaddq_f32(cur_up2_vec, vmulq_f32(cur_vec, v_four)), cur_down2_vec), vmulq_f32(vaddq_f32(cur_up1_vec, cur_down1_vec), v_three)));
uint32x4_t combing_cond = vcgtq_f32(combing1, v_athresh6);
mask_vec = vandq_u32(combing_cond, motion_check);
mask_vec = vandq_u32(mask_vec, condition);
mask_vec = vandq_u32(mask_vec, v_one);
vst1q_u32(mask32, mask_vec);
mask[0] = mask32[0];
mask[1] = mask32[1];
mask[2] = mask32[2];
mask[3] = mask32[3];
}
cur+=4;
prev+=4;
next+=4;
mask+=4;
}
}
}
#else
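// Scalar fallback: same gamma-domain algorithm, one pixel per iteration.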
static void FUNC(detect_gamma_combed_segment)(hb_filter_private_t *pv,
int segment_start, int segment_stop)
{
// A mishmash of various comb detection tricks
// picked up from neuron2's Decomb plugin for
// AviSynth and tritical's IsCombedT and
// IsCombedTIVTC plugins.
// Comb scoring algorithm
const float mthresh = pv->gamma_motion_threshold;
const float athresh = pv->gamma_spatial_threshold;
const float athresh6 = pv->gamma_spatial_threshold6;
// One pass for Y
const int stride_prev = pv->ref[0]->plane[0].stride / pv->bps;
const int stride_cur = pv->ref[1]->plane[0].stride / pv->bps;
const int stride_next = pv->ref[2]->plane[0].stride / pv->bps;
const int width = pv->ref[0]->plane[0].width;
const int height = pv->ref[0]->plane[0].height;
const int mask_stride = pv->mask->plane[0].stride;
// Comb detection has to start at y = 2 and end at
// y = height - 2, because it needs to examine
// 2 pixels above and 2 below the current pixel.
if (segment_start < 2)
{
segment_start = 2;
}
if (segment_stop > height - 2)
{
segment_stop = height - 2;
}
// These are just to make the buffer locations easier to read.
const int up_1_prev = -1 * stride_prev;
const int down_1_prev = stride_prev;
const int up_2 = -2 * stride_cur;
const int up_1 = -1 * stride_cur;
const int down_1 = stride_cur;
const int down_2 = 2 * stride_cur;
const int up_1_next = -1 * stride_next;
const int down_1_next = stride_next;
for (int y = segment_start; y < segment_stop; y++)
{
// We need to examine a column of 5 pixels
// in the prev, cur, and next frames.
const pixel *prev = &((const pixel *)pv->ref[0]->plane[0].data)[y * stride_prev];
const pixel *cur = &((const pixel *)pv->ref[1]->plane[0].data)[y * stride_cur];
const pixel *next = &((const pixel *)pv->ref[2]->plane[0].data)[y * stride_next];
uint8_t *mask = &pv->mask->plane[0].data[y * mask_stride];
memset(mask, 0, mask_stride);
for (int x = 0; x < width; x++)
{
const float up_diff = pv->gamma_lut[cur[0]] - pv->gamma_lut[cur[up_1]];
const float down_diff = pv->gamma_lut[cur[0]] - pv->gamma_lut[cur[down_1]];
if ((up_diff > athresh && down_diff > athresh) ||
(up_diff < -athresh && down_diff < -athresh))
{
// The pixel above and below are different,
// and they change in the same "direction" too.
int motion = 0;
if (mthresh > 0)
{
// Make sure there's sufficient motion between frame t-1 to frame t+1.
if (fabs(pv->gamma_lut[prev[0]] - pv->gamma_lut[cur[0]] ) > mthresh &&
fabs(pv->gamma_lut[cur[up_1]] - pv->gamma_lut[next[up_1_next]] ) > mthresh &&
fabs(pv->gamma_lut[cur[down_1]] - pv->gamma_lut[next[down_1_next]]) > mthresh)
{
motion++;
}
if (fabs(pv->gamma_lut[next[0]] - pv->gamma_lut[cur[0]] ) > mthresh &&
fabs(pv->gamma_lut[prev[up_1_prev]] - pv->gamma_lut[cur[up_1]] ) > mthresh &&
fabs(pv->gamma_lut[prev[down_1_prev]] - pv->gamma_lut[cur[down_1]]) > mthresh)
{
motion++;
}
}
else
{
// User doesn't want to check for motion,
// so move on to the spatial check.
motion = 1;
}
if (motion || pv->force_exaustive_check)
{
// Tritical's noise-resistant combing scorer.
// The check is done on a bob+blur convolution.
float combing = fabs(pv->gamma_lut[cur[up_2]] +
(4 * pv->gamma_lut[cur[0]]) +
pv->gamma_lut[cur[down_2]] -
(3 * (pv->gamma_lut[cur[up_1]] +
pv->gamma_lut[cur[down_1]])));
// If the frame is sufficiently combed,
// then mark it down on the mask as 1.
if (combing > athresh6)
{
mask[0] = 1;
}
}
}
cur++;
prev++;
next++;
mask++;
}
}
}
#endif
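/* Integer-domain comb detection. On non-Apple aarch64 a NEON variant is
   selected per bit depth: 32-bit lanes (4 pixels/iteration) for depths
   above 8, 16-bit lanes (8 pixels/iteration) for 8-bit sources. All other
   builds use the scalar implementation at the bottom. */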
#if defined (__aarch64__) && !defined(__APPLE__)
#if BIT_DEPTH > 8
static void FUNC(detect_combed_segment)(hb_filter_private_t *pv,
int segment_start, int segment_stop)
{
// A mishmash of various comb detection tricks
// picked up from neuron2's Decomb plugin for
// AviSynth and tritical's IsCombedT and
// IsCombedTIVTC plugins.
// Comb scoring algorithm
const int spatial_metric = pv->spatial_metric;
const int mthresh = pv->motion_threshold;
const int athresh = pv->spatial_threshold;
const int athresh_squared = pv->spatial_threshold_squared;
const int athresh6 = pv->spatial_threshold6;
// One pass for Y
const int stride_prev = pv->ref[0]->plane[0].stride / pv->bps;
const int stride_cur = pv->ref[1]->plane[0].stride / pv->bps;
const int stride_next = pv->ref[2]->plane[0].stride / pv->bps;
const int width = pv->ref[0]->plane[0].width;
const int height = pv->ref[0]->plane[0].height;
const int mask_stride = pv->mask->plane[0].stride;
// Comb detection has to start at y = 2 and end at
// y = height - 2, because it needs to examine
// 2 pixels above and 2 below the current pixel.
if (segment_start < 2)
{
segment_start = 2;
}
if (segment_stop > height - 2)
{
segment_stop = height - 2;
}
// These are just to make the buffer locations easier to read.
const int up_1_prev = -1 * stride_prev;
const int down_1_prev = stride_prev;
const int up_2 = -2 * stride_cur;
const int up_1 = -1 * stride_cur;
const int down_1 = stride_cur;
const int down_2 = 2 * stride_cur;
const int up_1_next = -1 * stride_next;
const int down_1_next = stride_next;
int32x4_t v_athresh = vdupq_n_s32(athresh);
int32x4_t v_athresh_neg = vdupq_n_s32(-athresh);
int32x4_t v_mthresh = vdupq_n_s32(mthresh);
int32x4_t v_athresh6 = vdupq_n_s32(athresh6);
int32x4_t v_athresh_squared = vdupq_n_s32(athresh_squared);
int32x4_t v_four = vdupq_n_s32(4);
int32x4_t v_three = vdupq_n_s32(3);
uint32x4_t v_one = vdupq_n_u32(1);
int32x4_t v_c32detect_min = vdupq_n_s32(pv->comb32detect_min);
int32x4_t v_c32detect_max = vdupq_n_s32(pv->comb32detect_max);
uint32x4_t v_exhaustive_check = vdupq_n_u32(pv->force_exaustive_check);
for (int y = segment_start; y < segment_stop; y++)
{
// We need to examine a column of 5 pixels
// in the prev, cur, and next frames.
const pixel *prev = &((const pixel *)pv->ref[0]->plane[0].data)[y * stride_prev];
const pixel *cur = &((const pixel *)pv->ref[1]->plane[0].data)[y * stride_cur];
const pixel *next = &((const pixel *)pv->ref[2]->plane[0].data)[y * stride_next];
uint8_t *mask = &pv->mask->plane[0].data[y * mask_stride];
memset(mask, 0, mask_stride);
uint32_t mask32[4];
for (int x = 0; x < width; x+=4)
{
uint32x4_t mask_vec = vdupq_n_u32(0);
int32x4_t cur_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(cur)));
int32x4_t cur_up1_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(cur + up_1)));
int32x4_t cur_down1_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(cur + down_1)));
int32x4_t up_diff_vec = vsubq_s32(cur_vec, cur_up1_vec);
int32x4_t down_diff_vec = vsubq_s32(cur_vec, cur_down1_vec);
uint32x4_t cond1 = vcgtq_s32(up_diff_vec, v_athresh);
uint32x4_t cond2 = vcgtq_s32(down_diff_vec, v_athresh);
uint32x4_t cond3 = vcltq_s32(up_diff_vec, v_athresh_neg);
uint32x4_t cond4 = vcltq_s32(down_diff_vec, v_athresh_neg);
uint32x4_t condition1 = vandq_u32(cond1, cond2);
uint32x4_t condition2 = vandq_u32(cond3, cond4);
uint32x4_t condition = vorrq_u32(condition1, condition2);
if (vmaxvq_u32(condition) > 0)
{
uint32x4_t motion = vdupq_n_u32(0);
uint32x4_t motion1 = vdupq_n_u32(0);
if (mthresh > 0)
{
int32x4_t prev_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(prev)));
int32x4_t next_up1_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(next + up_1_next)));
int32x4_t next_down1_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(next + down_1_next)));
int32x4_t abs_diff1 = vabsq_s32(vsubq_s32(prev_vec, cur_vec));
int32x4_t abs_diff2 = vabsq_s32(vsubq_s32(cur_up1_vec, next_up1_vec));
int32x4_t abs_diff3 = vabsq_s32(vsubq_s32(cur_down1_vec, next_down1_vec));
uint32x4_t motion_cond1 = vcgtq_s32(abs_diff1, v_mthresh);
uint32x4_t motion_cond2 = vcgtq_s32(abs_diff2, v_mthresh);
uint32x4_t motion_cond3 = vcgtq_s32(abs_diff3, v_mthresh);
motion = vandq_u32(vandq_u32(motion_cond1, motion_cond2), motion_cond3);
int32x4_t next_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(next)));
int32x4_t prev_up1_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(prev + up_1_prev)));
int32x4_t prev_down1_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(prev + down_1_prev)));
int32x4_t abs_diff4 = vabsq_s32(vsubq_s32(next_vec, cur_vec));
int32x4_t abs_diff5 = vabsq_s32(vsubq_s32(prev_up1_vec, cur_up1_vec));
int32x4_t abs_diff6 = vabsq_s32(vsubq_s32(prev_down1_vec, cur_down1_vec));
uint32x4_t motion_cond4 = vcgtq_s32(abs_diff4, v_mthresh);
uint32x4_t motion_cond5 = vcgtq_s32(abs_diff5, v_mthresh);
uint32x4_t motion_cond6 = vcgtq_s32(abs_diff6, v_mthresh);
motion1 = vandq_u32(vandq_u32(motion_cond4, motion_cond5), motion_cond6);
motion = vorrq_u32(motion, motion1);
}
else
{
motion = vdupq_n_u32(1);
}
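// Same zero-compare trick as the gamma path: promote nonzero motion lanes
// to all-ones masks.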
uint32x4_t motion_check = vorrq_u32(motion, v_exhaustive_check);
motion_check = vcgtq_u32(motion_check, mask_vec);
int32x4_t cur_up2_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(cur + up_2)));
int32x4_t cur_down2_vec = vreinterpretq_s32_u32(vmovl_u16(vld1_u16(cur + down_2)));
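// The three spatial metrics mirror the scalar branch: 0 = 32detect-style,
// 1 = IsCombed product of differences, 2 = Tritical's 5-tap scorer.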
switch (spatial_metric)
{
case 0:
{
uint32x4_t cond_c32_detect_min = vcltq_s32(vabsq_s32(vsubq_s32(cur_vec, cur_down2_vec)), v_c32detect_min);
uint32x4_t cond_c32_detect_max = vcgtq_s32(vabsq_s32(vsubq_s32(cur_vec, cur_down1_vec)), v_c32detect_max);
uint32x4_t s_metric_0_vec = vandq_u32(cond_c32_detect_min, cond_c32_detect_max);
mask_vec = vandq_u32(s_metric_0_vec, motion_check);
mask_vec = vandq_u32(mask_vec, condition);
mask_vec = vandq_u32(mask_vec, v_one);
vst1q_u32(mask32, mask_vec);
mask[0] = mask32[0];
mask[1] = mask32[1];
mask[2] = mask32[2];
mask[3] = mask32[3];
break;
}
case 1:
{
int32x4_t s_metric_1_diff1 = vsubq_s32(cur_up1_vec, cur_vec);
int32x4_t s_metric_1_diff2 = vsubq_s32(cur_down1_vec, cur_vec);
int32x4_t s_metric_1_mul = vmulq_s32(s_metric_1_diff1, s_metric_1_diff2);
uint32x4_t s_metric_1_vec = vcgtq_s32(s_metric_1_mul, v_athresh_squared);
mask_vec = vandq_u32(s_metric_1_vec, motion_check);
mask_vec = vandq_u32(mask_vec, condition);
mask_vec = vandq_u32(mask_vec, v_one);
vst1q_u32(mask32, mask_vec);
mask[0] = mask32[0];
mask[1] = mask32[1];
mask[2] = mask32[2];
mask[3] = mask32[3];
break;
}
case 2:
{
int32x4_t combing1 = vabsq_s32(vsubq_s32(vaddq_s32(vaddq_s32(cur_up2_vec, vmulq_s32(cur_vec, v_four)), cur_down2_vec), vmulq_s32(vaddq_s32(cur_up1_vec, cur_down1_vec), v_three)));
uint32x4_t s_metric_2_vec = vcgtq_s32(combing1, v_athresh6);
mask_vec = vandq_u32(s_metric_2_vec, motion_check);
mask_vec = vandq_u32(mask_vec, condition);
mask_vec = vandq_u32(mask_vec, v_one);
vst1q_u32(mask32, mask_vec);
mask[0] = mask32[0];
mask[1] = mask32[1];
mask[2] = mask32[2];
mask[3] = mask32[3];
break;
}
}
}
cur+=4;
prev+=4;
next+=4;
mask+=4;
}
}
}
#else
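// 8-bit sources: widen samples to 16-bit lanes and process eight pixels
// per iteration.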
static void FUNC(detect_combed_segment)(hb_filter_private_t *pv,
int segment_start, int segment_stop)
{
// A mishmash of various comb detection tricks
// picked up from neuron2's Decomb plugin for
// AviSynth and tritical's IsCombedT and
// IsCombedTIVTC plugins.
// Comb scoring algorithm
const int spatial_metric = pv->spatial_metric;
const int mthresh = pv->motion_threshold;
const int athresh = pv->spatial_threshold;
const int athresh_squared = pv->spatial_threshold_squared;
const int athresh6 = pv->spatial_threshold6;
// One pass for Y
const int stride_prev = pv->ref[0]->plane[0].stride / pv->bps;
const int stride_cur = pv->ref[1]->plane[0].stride / pv->bps;
const int stride_next = pv->ref[2]->plane[0].stride / pv->bps;
const int width = pv->ref[0]->plane[0].width;
const int height = pv->ref[0]->plane[0].height;
const int mask_stride = pv->mask->plane[0].stride;
// Comb detection has to start at y = 2 and end at
// y = height - 2, because it needs to examine
// 2 pixels above and 2 below the current pixel.
if (segment_start < 2)
{
segment_start = 2;
}
if (segment_stop > height - 2)
{
segment_stop = height - 2;
}
// These are just to make the buffer locations easier to read.
const int up_1_prev = -1 * stride_prev;
const int down_1_prev = stride_prev;
const int up_2 = -2 * stride_cur;
const int up_1 = -1 * stride_cur;
const int down_1 = stride_cur;
const int down_2 = 2 * stride_cur;
const int up_1_next = -1 * stride_next;
const int down_1_next = stride_next;
int16x8_t v_athresh = vdupq_n_s16(athresh);
int16x8_t v_athresh_neg = vdupq_n_s16(-athresh);
int16x8_t v_mthresh = vdupq_n_s16(mthresh);
int16x8_t v_athresh6 = vdupq_n_s16(athresh6);
int16x8_t v_athresh_squared = vdupq_n_s16(athresh_squared);
int16x8_t v_four = vdupq_n_s16(4);
int16x8_t v_three = vdupq_n_s16(3);
uint16x8_t v_one = vdupq_n_u16(1);
int16x8_t v_c32detect_min = vdupq_n_s16(pv->comb32detect_min);
int16x8_t v_c32detect_max = vdupq_n_s16(pv->comb32detect_max);
uint16x8_t v_exhaustive_check = vdupq_n_u16(pv->force_exaustive_check);
for (int y = segment_start; y < segment_stop; y++)
{
// We need to examine a column of 5 pixels
// in the prev, cur, and next frames.
const pixel *prev = &((const pixel *)pv->ref[0]->plane[0].data)[y * stride_prev];
const pixel *cur = &((const pixel *)pv->ref[1]->plane[0].data)[y * stride_cur];
const pixel *next = &((const pixel *)pv->ref[2]->plane[0].data)[y * stride_next];
uint8_t *mask = &pv->mask->plane[0].data[y * mask_stride];
memset(mask, 0, mask_stride);
for (int x = 0; x < width; x+=8)
{
uint16x8_t mask_vec = vdupq_n_u16(0);
int16x8_t cur_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)cur)));
int16x8_t cur_up1_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)cur + up_1)));
int16x8_t cur_down1_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)cur + down_1)));
int16x8_t up_diff_vec = vsubq_s16(cur_vec, cur_up1_vec);
int16x8_t down_diff_vec = vsubq_s16(cur_vec, cur_down1_vec);
uint16x8_t cond1 = vcgtq_s16(up_diff_vec, v_athresh);
uint16x8_t cond2 = vcgtq_s16(down_diff_vec, v_athresh);
uint16x8_t cond3 = vcltq_s16(up_diff_vec, v_athresh_neg);
uint16x8_t cond4 = vcltq_s16(down_diff_vec, v_athresh_neg);
uint16x8_t condition1 = vandq_u16(cond1, cond2);
uint16x8_t condition2 = vandq_u16(cond3, cond4);
uint16x8_t condition = vorrq_u16(condition1, condition2);
if (vmaxvq_u16(condition) > 0)
{
uint16x8_t motion = vdupq_n_u16(0);
uint16x8_t motion1 = vdupq_n_u16(0);
if (mthresh > 0)
{
int16x8_t prev_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)prev)));
int16x8_t next_up1_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)next + up_1_next)));
int16x8_t next_down1_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)next + down_1_next)));
int16x8_t abs_diff1 = vabsq_s16(vsubq_s16(prev_vec, cur_vec));
int16x8_t abs_diff2 = vabsq_s16(vsubq_s16(cur_up1_vec, next_up1_vec));
int16x8_t abs_diff3 = vabsq_s16(vsubq_s16(cur_down1_vec, next_down1_vec));
uint16x8_t motion_cond1 = vcgtq_s16(abs_diff1, v_mthresh);
uint16x8_t motion_cond2 = vcgtq_s16(abs_diff2, v_mthresh);
uint16x8_t motion_cond3 = vcgtq_s16(abs_diff3, v_mthresh);
motion = vandq_u16(vandq_u16(motion_cond1, motion_cond2), motion_cond3);
int16x8_t next_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)next)));
int16x8_t prev_up1_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)prev + up_1_prev)));
int16x8_t prev_down1_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)prev + down_1_prev)));
int16x8_t abs_diff4 = vabsq_s16(vsubq_s16(next_vec, cur_vec));
int16x8_t abs_diff5 = vabsq_s16(vsubq_s16(prev_up1_vec, cur_up1_vec));
int16x8_t abs_diff6 = vabsq_s16(vsubq_s16(prev_down1_vec, cur_down1_vec));
uint16x8_t motion_cond4 = vcgtq_s16(abs_diff4, v_mthresh);
uint16x8_t motion_cond5 = vcgtq_s16(abs_diff5, v_mthresh);
uint16x8_t motion_cond6 = vcgtq_s16(abs_diff6, v_mthresh);
motion1 = vandq_u16(vandq_u16(motion_cond4, motion_cond5), motion_cond6);
motion = vorrq_u16(motion, motion1);
}
else
{
motion = vdupq_n_u16(1);
}
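// As above: mask_vec is still zero, so the compare yields all-ones lanes
// wherever motion was detected (or the exhaustive check is forced).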
uint16x8_t motion_check = vorrq_u16(motion, v_exhaustive_check);
motion_check = vcgtq_u16(motion_check, mask_vec);
int16x8_t cur_up2_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)cur + up_2)));
int16x8_t cur_down2_vec = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t*)cur + down_2)));
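// Same three spatial metrics as the 16-bit variant above.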
switch (spatial_metric)
{
case 0:
{
uint16x8_t cond_c32_detect_min = vcltq_s16(vabsq_s16(vsubq_s16(cur_vec, cur_down2_vec)), v_c32detect_min);
uint16x8_t cond_c32_detect_max = vcgtq_s16(vabsq_s16(vsubq_s16(cur_vec, cur_down1_vec)), v_c32detect_max);
uint16x8_t s_metric_0_vec = vandq_u16(cond_c32_detect_min, cond_c32_detect_max);
mask_vec = vandq_u16(s_metric_0_vec, motion_check);
mask_vec = vandq_u16(mask_vec, condition);
mask_vec = vandq_u16(mask_vec, v_one);
vst1_u8(&mask[x], vmovn_u16(mask_vec));
break;
}
case 1:
{
int16x8_t s_metric_1_diff1 = vsubq_s16(cur_up1_vec, cur_vec);
int16x8_t s_metric_1_diff2 = vsubq_s16(cur_down1_vec, cur_vec);
int16x8_t s_metric_1_mul = vmulq_s16(s_metric_1_diff1, s_metric_1_diff2);
uint16x8_t s_metric_1_vec = vcgtq_s16(s_metric_1_mul, v_athresh_squared);
mask_vec = vandq_u16(s_metric_1_vec, motion_check);
mask_vec = vandq_u16(mask_vec, condition);
mask_vec = vandq_u16(mask_vec, v_one);
vst1_u8(&mask[x], vmovn_u16(mask_vec));
break;
}
case 2:
{
int16x8_t combing1 = vabsq_s16(vsubq_s16(vaddq_s16(vaddq_s16(cur_up2_vec, vmulq_s16(cur_vec, v_four)), cur_down2_vec), vmulq_s16(vaddq_s16(cur_up1_vec, cur_down1_vec), v_three)));
uint16x8_t s_metric_2_vec = vcgtq_s16(combing1, v_athresh6);
mask_vec = vandq_u16(s_metric_2_vec, motion_check);
mask_vec = vandq_u16(mask_vec, condition);
mask_vec = vandq_u16(mask_vec, v_one);
vst1_u8(&mask[x], vmovn_u16(mask_vec));
break;
}
}
}
cur+=8;
prev+=8;
next+=8;
}
}
}
#endif
#else
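// Scalar fallback: same integer algorithm, one pixel per iteration.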
static void FUNC(detect_combed_segment)(hb_filter_private_t *pv,
int segment_start, int segment_stop)
{
// A mishmash of various comb detection tricks
// picked up from neuron2's Decomb plugin for
// AviSynth and tritical's IsCombedT and
// IsCombedTIVTC plugins.
// Comb scoring algorithm
const int spatial_metric = pv->spatial_metric;
const int mthresh = pv->motion_threshold;
const int athresh = pv->spatial_threshold;
const int athresh_squared = pv->spatial_threshold_squared;
const int athresh6 = pv->spatial_threshold6;
// One pass for Y
const int stride_prev = pv->ref[0]->plane[0].stride / pv->bps;
const int stride_cur = pv->ref[1]->plane[0].stride / pv->bps;
const int stride_next = pv->ref[2]->plane[0].stride / pv->bps;
const int width = pv->ref[0]->plane[0].width;
const int height = pv->ref[0]->plane[0].height;
const int mask_stride = pv->mask->plane[0].stride;
// Comb detection has to start at y = 2 and end at
// y = height - 2, because it needs to examine
// 2 pixels above and 2 below the current pixel.
if (segment_start < 2)
{
segment_start = 2;
}
if (segment_stop > height - 2)
{
segment_stop = height - 2;
}
// These are just to make the buffer locations easier to read.
const int up_1_prev = -1 * stride_prev;
const int down_1_prev = stride_prev;
const int up_2 = -2 * stride_cur;
const int up_1 = -1 * stride_cur;
const int down_1 = stride_cur;
const int down_2 = 2 * stride_cur;
const int up_1_next = -1 * stride_next;
const int down_1_next = stride_next;
for (int y = segment_start; y < segment_stop; y++)
{
// We need to examine a column of 5 pixels
// in the prev, cur, and next frames.
const pixel *prev = &((const pixel *)pv->ref[0]->plane[0].data)[y * stride_prev];
const pixel *cur = &((const pixel *)pv->ref[1]->plane[0].data)[y * stride_cur];
const pixel *next = &((const pixel *)pv->ref[2]->plane[0].data)[y * stride_next];
uint8_t *mask = &pv->mask->plane[0].data[y * mask_stride];
memset(mask, 0, mask_stride);
for (int x = 0; x < width; x++)
{
const int up_diff = cur[0] - cur[up_1];
const int down_diff = cur[0] - cur[down_1];
if ((up_diff > athresh && down_diff > athresh) ||
(up_diff < -athresh && down_diff < -athresh))
{
// The pixel above and below are different,
// and they change in the same "direction" too.
int motion = 0;
if (mthresh > 0)
{
// Make sure there's sufficient motion between frame t-1 to frame t+1.
if (abs(prev[0] - cur[0] ) > mthresh &&
abs(cur[up_1] - next[up_1_next] ) > mthresh &&
abs(cur[down_1] - next[down_1_next]) > mthresh)
{
motion++;
}
if (abs(next[0] - cur[0] ) > mthresh &&
abs(prev[up_1_prev] - cur[up_1] ) > mthresh &&
abs(prev[down_1_prev] - cur[down_1]) > mthresh)
{
motion++;
}
}
else
{
// User doesn't want to check for motion,
// so move on to the spatial check.
motion = 1;
}
// If motion, or we can't measure motion yet...
if (motion || pv->force_exaustive_check)
{
// That means it's time for the spatial check.
// We've got several options here.
if (spatial_metric == 0)
{
// Simple 32detect style comb detection.
if ((abs(cur[0] - cur[down_2]) < pv->comb32detect_min) &&
(abs(cur[0] - cur[down_1]) > pv->comb32detect_max))
{
mask[0] = 1;
}
}
else if (spatial_metric == 1)
{
// This, for comparison, is what IsCombed uses.
// It's better, but still noise sensitive.
const int combing = (cur[up_1] - cur[0]) *
(cur[down_1] - cur[0]);
if (combing > athresh_squared)
{
mask[0] = 1;
}
}
else if (spatial_metric == 2)
{
// Tritical's noise-resistant combing scorer.
// The check is done on a bob+blur convolution.
const int combing = abs( cur[up_2]
+ ( 4 * cur[0] )
+ cur[down_2]
- ( 3 * ( cur[up_1]
+ cur[down_1] ) ) );
// If the frame is sufficiently combed,
// then mark it down on the mask as 1.
if (combing > athresh6)
{
mask[0] = 1;
}
}
}
}
cur++;
prev++;
next++;
mask++;
}
}
}
#endif
#undef pixel
#undef FUNC