/* nlmeans.c

   Copyright (c) 2013 Dirk Farin
   Copyright (c) 2003-2025 HandBrake Team
   This file is part of the HandBrake source code
   Homepage: <http://handbrake.fr/>.
   It may be used under the terms of the GNU General Public License v2.
   For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
 */

/* Usage
 *
 * Parameters:
 *     lumaY_strength   : lumaY_origin_tune   : lumaY_patch_size   : lumaY_range   : lumaY_frames   : lumaY_prefilter   :
 *     chromaB_strength : chromaB_origin_tune : chromaB_patch_size : chromaB_range : chromaB_frames : chromaB_prefilter :
 *     chromaR_strength : chromaR_origin_tune : chromaR_patch_size : chromaR_range : chromaR_frames : chromaR_prefilter
 *
 * Defaults:
 *     6:1:7:3:2:0 for each channel (equivalent to 6:1:7:3:2:0:6:1:7:3:2:0:6:1:7:3:2:0)
 *
 * Parameters cascade, e.g. 6:0.8:7:3:3:0:4:1 sets:
 *     strength 6, origin tune 0.8 for luma
 *     patch size 7, range 3, frames 3, prefilter 0 for all channels
 *     strength 4, origin tune 1 for both chroma channels
 *
 * Strength is relative and must be adjusted; ALL parameters affect overall strength.
 * Lower origin tune improves results for noisier input or animation (film 0.5-1, animation 0.15-0.5).
 * Large patch size (>9) may greatly reduce quality by clobbering detail.
 * Larger search range increases quality; however, computation time increases exponentially.
 * Large number of frames (film >3, animation >6) may cause temporal smearing.
 * Prefiltering can potentially improve weight decisions, yielding better results for difficult sources.
 *
 * Prefilter enum combos:
 *     1: Mean 3x3
 *     2: Mean 5x5
 *     3: Mean 5x5 (overrides Mean 3x3)
 *   257: Mean 3x3 reduced by 25%
 *   258: Mean 5x5 reduced by 25%
 *   513: Mean 3x3 reduced by 50%
 *   514: Mean 5x5 reduced by 50%
 *   769: Mean 3x3 reduced by 75%
 *   770: Mean 5x5 reduced by 75%
 *  1025: Mean 3x3 plus edge boost (restores lost edge detail)
 *  1026: Mean 5x5 plus edge boost
 *  1281: Mean 3x3 reduced by 25% plus edge boost
 *  etc...
 *  2049: Mean 3x3 passthru (NLMeans off, prefilter is the output)
 *  etc...
 *  3329: Mean 3x3 reduced by 25% plus edge boost, passthru
 *  etc...
 */

#include "handbrake/handbrake.h"
#include "handbrake/hbffmpeg.h"
#include "handbrake/taskset.h"
#include "handbrake/nlmeans.h"

/* Per-channel defaults applied when neither the user nor the cascade
 * supplied a value. */
#define NLMEANS_STRENGTH_LUMA_DEFAULT      6
#define NLMEANS_STRENGTH_CHROMA_DEFAULT    6
#define NLMEANS_ORIGIN_TUNE_LUMA_DEFAULT   1
#define NLMEANS_ORIGIN_TUNE_CHROMA_DEFAULT 1
#define NLMEANS_PATCH_SIZE_LUMA_DEFAULT    7
#define NLMEANS_PATCH_SIZE_CHROMA_DEFAULT  7
#define NLMEANS_RANGE_LUMA_DEFAULT         3
#define NLMEANS_RANGE_CHROMA_DEFAULT       3
#define NLMEANS_FRAMES_LUMA_DEFAULT        2
#define NLMEANS_FRAMES_CHROMA_DEFAULT      2
#define NLMEANS_PREFILTER_LUMA_DEFAULT     0
#define NLMEANS_PREFILTER_CHROMA_DEFAULT   0

/* Prefilter mode bit flags; see "Prefilter enum combos" above. */
#define NLMEANS_PREFILTER_MODE_MEAN3X3     1
#define NLMEANS_PREFILTER_MODE_MEAN5X5     2
#define NLMEANS_PREFILTER_MODE_MEDIAN3X3   4
#define NLMEANS_PREFILTER_MODE_MEDIAN5X5   8
#define NLMEANS_PREFILTER_MODE_CSM3X3      16
#define NLMEANS_PREFILTER_MODE_CSM5X5      32
#define NLMEANS_PREFILTER_MODE_RESERVED64  64   // Reserved
#define NLMEANS_PREFILTER_MODE_RESERVED128 128  // Reserved
#define NLMEANS_PREFILTER_MODE_REDUCE25    256
#define NLMEANS_PREFILTER_MODE_REDUCE50    512
#define NLMEANS_PREFILTER_MODE_EDGEBOOST   1024
#define NLMEANS_PREFILTER_MODE_PASSTHRU    2048

/* Integer compare-and-swap helpers (XOR swap: the two operands must be
 * distinct integer lvalues). Not used in this file directly; presumably
 * consumed by the included templates. */
#define NLMEANS_SORT(a,b) { if (a > b) NLMEANS_SWAP(a, b); }
#define NLMEANS_SWAP(a,b) { a = (a ^ b); b = (a ^ b); a = (b ^ a); }

#define NLMEANS_FRAMES_MAX 32   // upper clamp for the per-channel frame count
#define NLMEANS_EXPSIZE    128  // number of entries in each exponential table

/* One image plane plus the border added by the nlmeans_alloc callback.
 * The fields are filled in by the bit-depth templates; this file only frees
 * `mem`/`mem_pre` and manages `mutex`. */
typedef struct
{
    void *mem;        // heap allocation, released with free() in this file
    void *mem_pre;    // may alias `mem`; freed separately only when it differs
    void *image;
    void *image_pre;
    int   w;
    int   h;
    int   border;
    hb_lock_t *mutex; // created in nlmeans_init, never moved between slots
    int   prefiltered;
} BorderedPlane;

/* One buffered input frame: three bordered planes plus a property-only
 * hb_buffer_t carrying the input buffer's props. */
typedef struct
{
    int width;
    int height;
    int fmt;
    BorderedPlane plane[3];
    hb_buffer_t  *buf;  // input buf sidedata
} Frame;

struct PixelSum
{
    float weight_sum;
    float pixel_sum;
};

/* Per-thread argument block handed to nlmeans_filter_work by the taskset. */
typedef struct
{
    taskset_thread_arg_t arg;
    hb_filter_private_t *pv;
    hb_buffer_t *out;   // frame produced by the worker for its segment
} nlmeans_thread_arg_t;

struct hb_filter_private_s
{
    int depth;      // bits per component of the input pixel format
    int bps;        // bytes per sample: 1 for depth <= 8, otherwise 2
    int max_value;  // (1 << depth) - 1

    double strength[3];    // averaging weight decay, larger produces smoother output
    double origin_tune[3]; // weight tuning for origin patch, 0.00..1.00
    int    patch_size[3];  // pixel context region width  (must be odd)
    int    range[3];       // spatial search window width (must be odd)
    int    nframes[3];     // temporal search depth in frames
    int    prefilter[3];   // prefilter mode, can improve weight analysis
    int    threads;        // number of frame threads to use, 0 == auto

    float  exptable[3][NLMEANS_EXPSIZE];
    float  weight_fact_table[3];
    int    diff_max[3];

    NLMeansFunctions functions;

    // Bit-depth specific implementations selected in nlmeans_init
    void (*nlmeans_alloc)(const void *src,
                          const int src_w,
                          const int src_s,
                          const int src_h,
                          BorderedPlane *dst,
                          const int border);

    void (*nlmeans_prefilter)(BorderedPlane *src,
                              const int filter_type);

    void (*nlmeans_deborder)(const BorderedPlane *src,
                             void *in_dst,
                             const int w,
                             const int s,
                             const int h);

    void (*nlmeans_plane)(NLMeansFunctions *functions,
                          Frame *frame,
                          int prefilter,
                          int plane,
                          int nframes,
                          void *dst,
                          int dst_w,
                          int dst_s,
                          int dst_h,
                          double h_param,
                          double origin_tune,
                          int n,
                          int r,
                          const float *exptable,
                          const float weight_fact_table,
                          const int diff_max);

    Frame *frame;       // threads + max_frames slots
    int    next_frame;  // number of slots currently holding a buffered frame
    int    max_frames;  // largest nframes[] across the three channels

    taskset_t taskset;

    nlmeans_thread_arg_t ** thread_data;

    hb_filter_init_t input;
    hb_filter_init_t output;
};

static int nlmeans_init(hb_filter_object_t *filter, hb_filter_init_t *init);
static int nlmeans_work(hb_filter_object_t *filter,
                        hb_buffer_t **buf_in,
                        hb_buffer_t **buf_out);
static void nlmeans_close(hb_filter_object_t *filter);

static void nlmeans_filter_work(void *thread_args_v);

static const char nlmeans_template[] =
    "y-strength=^"HB_FLOAT_REG"$:y-origin-tune=^"HB_FLOAT_REG"$:"
    "y-patch-size=^"HB_INT_REG"$:y-range=^"HB_INT_REG"$:"
    "y-frame-count=^"HB_INT_REG"$:y-prefilter=^"HB_INT_REG"$:"
    "cb-strength=^"HB_FLOAT_REG"$:cb-origin-tune=^"HB_FLOAT_REG"$:"
    "cb-patch-size=^"HB_INT_REG"$:cb-range=^"HB_INT_REG"$:"
    "cb-frame-count=^"HB_INT_REG"$:cb-prefilter=^"HB_INT_REG"$:"
    "cr-strength=^"HB_FLOAT_REG"$:cr-origin-tune=^"HB_FLOAT_REG"$:"
    "cr-patch-size=^"HB_INT_REG"$:cr-range=^"HB_INT_REG"$:"
    "cr-frame-count=^"HB_INT_REG"$:cr-prefilter=^"HB_INT_REG"$:"
    "threads=^"HB_INT_REG"$";

hb_filter_object_t hb_filter_nlmeans =
{
    .id                = HB_FILTER_NLMEANS,
    .enforce_order     = 1,
    .name              = "Denoise (nlmeans)",
    .settings          = NULL,
    .init              = nlmeans_init,
    .work              = nlmeans_work,
    .close             = nlmeans_close,
    .settings_template = nlmeans_template,
};

// Instantiate the pixel routines once per supported sample width
#define BIT_DEPTH 8
#include "templates/nlmeans_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 16
#include "templates/nlmeans_template.c"
#undef BIT_DEPTH

/* Free the pixel storage owned by one plane and clear both pointers.
 * mem_pre is always cleared, even when it aliases mem, so that a later
 * release of the same slot can never free a stale pointer. */
static void nlmeans_release_plane(BorderedPlane *plane)
{
    if (plane->mem_pre != NULL && plane->mem_pre != plane->mem)
    {
        free(plane->mem_pre);
    }
    plane->mem_pre = NULL;

    free(plane->mem);
    plane->mem = NULL;
}

/* Release every frame slot (planes, property buffer, locks) and the slot
 * array itself. Safe to call when the array was never allocated. */
static void nlmeans_free_frames(hb_filter_private_t *pv)
{
    if (pv->frame == NULL)
    {
        return;
    }

    const int slots = pv->threads + pv->max_frames;
    for (int ii = 0; ii < slots; ii++)
    {
        Frame *frame = &pv->frame[ii];
        for (int c = 0; c < 3; c++)
        {
            nlmeans_release_plane(&frame->plane[c]);
            hb_lock_close(&frame->plane[c].mutex);
        }
        hb_buffer_close(&frame->buf);
    }

    free(pv->frame);
    pv->frame = NULL;
}

/* Initialise the filter: pick the bit-depth specific routines, read and
 * sanitise the user settings, precompute the weight tables and start the
 * worker taskset.
 *
 * Returns 0 on success. On failure returns -1 with every partially acquired
 * resource released and filter->private_data reset to NULL. */
static int nlmeans_init(hb_filter_object_t *filter,
                        hb_filter_init_t *init)
{
    int taskset_ready = 0;

    hb_filter_private_t *pv = calloc(1, sizeof(*pv));
    filter->private_data = pv;
    if (pv == NULL)
    {
        hb_error("nlmeans: calloc failed");
        return -1;
    }

    NLMeansFunctions *functions = &pv->functions;

    pv->input = *init;

    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(init->pix_fmt);
    if (desc == NULL)
    {
        hb_error("nlmeans: unknown pixel format");
        goto fail;
    }

    pv->depth     = desc->comp[0].depth;
    pv->bps       = pv->depth > 8 ? 2 : 1;
    pv->max_value = (1 << pv->depth) - 1;

    switch (pv->depth)
    {
        case 8:
            functions->build_integral = build_integral_scalar_8;
            pv->nlmeans_alloc         = nlmeans_alloc_8;
            pv->nlmeans_prefilter     = nlmeans_prefilter_8;
            pv->nlmeans_deborder      = nlmeans_deborder_8;
            pv->nlmeans_plane         = nlmeans_plane_8;
#if defined(ARCH_X86)
            nlmeans_init_x86(functions);
#endif
            break;

        case 16:
        default:
            // Every depth other than 8 goes through the 16-bit routines
            functions->build_integral = build_integral_scalar_16;
            pv->nlmeans_alloc         = nlmeans_alloc_16;
            pv->nlmeans_prefilter     = nlmeans_prefilter_16;
            pv->nlmeans_deborder      = nlmeans_deborder_16;
            pv->nlmeans_plane         = nlmeans_plane_16;
            break;
    }

    // Mark parameters unset
    for (int c = 0; c < 3; c++)
    {
        pv->strength[c]    = -1;
        pv->origin_tune[c] = -1;
        pv->patch_size[c]  = -1;
        pv->range[c]       = -1;
        pv->nframes[c]     = -1;
        pv->prefilter[c]   = -1;
    }
    pv->threads = -1;

    // Read user parameters
    if (filter->settings != NULL)
    {
        hb_dict_t * dict = filter->settings;
        hb_dict_extract_double(&pv->strength[0],    dict, "y-strength");
        hb_dict_extract_double(&pv->origin_tune[0], dict, "y-origin-tune");
        hb_dict_extract_int(&pv->patch_size[0],     dict, "y-patch-size");
        hb_dict_extract_int(&pv->range[0],          dict, "y-range");
        hb_dict_extract_int(&pv->nframes[0],        dict, "y-frame-count");
        hb_dict_extract_int(&pv->prefilter[0],      dict, "y-prefilter");

        hb_dict_extract_double(&pv->strength[1],    dict, "cb-strength");
        hb_dict_extract_double(&pv->origin_tune[1], dict, "cb-origin-tune");
        hb_dict_extract_int(&pv->patch_size[1],     dict, "cb-patch-size");
        hb_dict_extract_int(&pv->range[1],          dict, "cb-range");
        hb_dict_extract_int(&pv->nframes[1],        dict, "cb-frame-count");
        hb_dict_extract_int(&pv->prefilter[1],      dict, "cb-prefilter");

        hb_dict_extract_double(&pv->strength[2],    dict, "cr-strength");
        hb_dict_extract_double(&pv->origin_tune[2], dict, "cr-origin-tune");
        hb_dict_extract_int(&pv->patch_size[2],     dict, "cr-patch-size");
        hb_dict_extract_int(&pv->range[2],          dict, "cr-range");
        hb_dict_extract_int(&pv->nframes[2],        dict, "cr-frame-count");
        hb_dict_extract_int(&pv->prefilter[2],      dict, "cr-prefilter");

        hb_dict_extract_int(&pv->threads,           dict, "threads");
    }

    // Cascade values
    // Cr not set; inherit Cb. Cb not set; inherit Y. Y not set; defaults.
    for (int c = 1; c < 3; c++)
    {
        if (pv->strength[c]    == -1) { pv->strength[c]    = pv->strength[c-1]; }
        if (pv->origin_tune[c] == -1) { pv->origin_tune[c] = pv->origin_tune[c-1]; }
        if (pv->patch_size[c]  == -1) { pv->patch_size[c]  = pv->patch_size[c-1]; }
        if (pv->range[c]       == -1) { pv->range[c]       = pv->range[c-1]; }
        if (pv->nframes[c]     == -1) { pv->nframes[c]     = pv->nframes[c-1]; }
        if (pv->prefilter[c]   == -1) { pv->prefilter[c]   = pv->prefilter[c-1]; }
    }

    for (int c = 0; c < 3; c++)
    {
        // Replace unset values with defaults (plane 0 is luma, 1 and 2 are chroma)
        if (pv->strength[c]    == -1) { pv->strength[c]    = c ? NLMEANS_STRENGTH_CHROMA_DEFAULT    : NLMEANS_STRENGTH_LUMA_DEFAULT; }
        if (pv->origin_tune[c] == -1) { pv->origin_tune[c] = c ? NLMEANS_ORIGIN_TUNE_CHROMA_DEFAULT : NLMEANS_ORIGIN_TUNE_LUMA_DEFAULT; }
        if (pv->patch_size[c]  == -1) { pv->patch_size[c]  = c ? NLMEANS_PATCH_SIZE_CHROMA_DEFAULT  : NLMEANS_PATCH_SIZE_LUMA_DEFAULT; }
        if (pv->range[c]       == -1) { pv->range[c]       = c ? NLMEANS_RANGE_CHROMA_DEFAULT       : NLMEANS_RANGE_LUMA_DEFAULT; }
        if (pv->nframes[c]     == -1) { pv->nframes[c]     = c ? NLMEANS_FRAMES_CHROMA_DEFAULT      : NLMEANS_FRAMES_LUMA_DEFAULT; }
        if (pv->prefilter[c]   == -1) { pv->prefilter[c]   = c ? NLMEANS_PREFILTER_CHROMA_DEFAULT   : NLMEANS_PREFILTER_LUMA_DEFAULT; }

        // Sanitize
        if (pv->strength[c] < 0)        { pv->strength[c] = 0; }
        if (pv->origin_tune[c] < 0.01)  { pv->origin_tune[c] = 0.01; } // avoid black artifacts
        if (pv->origin_tune[c] > 1)     { pv->origin_tune[c] = 1; }
        if (pv->patch_size[c] % 2 == 0) { pv->patch_size[c]--; }
        if (pv->patch_size[c] < 1)      { pv->patch_size[c] = 1; }
        if (pv->range[c] % 2 == 0)      { pv->range[c]--; }
        if (pv->range[c] < 1)           { pv->range[c] = 1; }
        if (pv->nframes[c] < 1)         { pv->nframes[c] = 1; }
        if (pv->nframes[c] > NLMEANS_FRAMES_MAX) { pv->nframes[c] = NLMEANS_FRAMES_MAX; }
        if (pv->prefilter[c] < 0)       { pv->prefilter[c] = 0; }

        if (pv->max_frames < pv->nframes[c])
        {
            pv->max_frames = pv->nframes[c];
        }

        // Scale strength with bit depth
        pv->strength[c] *= pv->depth > 8 ? (pv->depth - 8) * (pv->depth - 8) : 1;

        // Precompute exponential table
        // NOTE: with strength 0 weight_factor is infinite; such planes are
        // bypassed by the renderer, so the table is never consulted for them.
        float *exptable = &pv->exptable[c][0];
        float *weight_fact_table = &pv->weight_fact_table[c];
        int   *diff_max = &pv->diff_max[c];
        const float weight_factor        = 1.0/pv->patch_size[c]/pv->patch_size[c] / (pv->strength[c] * pv->strength[c]);
        const float min_weight_in_table  = 0.0005;
        const float stretch              = NLMEANS_EXPSIZE / (-log(min_weight_in_table));
        *(weight_fact_table)             = weight_factor * stretch;
        *(diff_max)                      = NLMEANS_EXPSIZE / *(weight_fact_table);
        for (int i = 0; i < NLMEANS_EXPSIZE; i++)
        {
            exptable[i] = exp(-i/stretch);
        }
        exptable[NLMEANS_EXPSIZE-1] = 0;
    }

    // Threads
    if (pv->threads < 1)
    {
        pv->threads = hb_get_cpu_count();

        // Reduce internal thread count where we have many logical cores
        // Too many threads increases CPU cache pressure, reducing performance
        if (pv->threads >= 32)
        {
            pv->threads = pv->threads / 2;
        }
        else if (pv->threads >= 16)
        {
            pv->threads = (pv->threads / 4) * 3;
        }
    }
    hb_log("NLMeans using %i threads", pv->threads);

    pv->frame = calloc(pv->threads + pv->max_frames, sizeof(Frame));
    if (pv->frame == NULL)
    {
        hb_error("nlmeans: calloc failed");
        goto fail;
    }
    for (int ii = 0; ii < pv->threads + pv->max_frames; ii++)
    {
        for (int c = 0; c < 3; c++)
        {
            pv->frame[ii].plane[c].mutex = hb_lock_init();
        }
    }

    pv->thread_data = calloc(pv->threads, sizeof(*pv->thread_data));
    if (pv->thread_data == NULL)
    {
        hb_error("nlmeans: calloc failed");
        goto fail;
    }

    if (taskset_init(&pv->taskset, "nlmeans_filter", pv->threads,
                     sizeof(nlmeans_thread_arg_t), nlmeans_filter_work) == 0)
    {
        hb_error("NLMeans could not initialize taskset");
        goto fail;
    }
    taskset_ready = 1;

    for (int ii = 0; ii < pv->threads; ii++)
    {
        pv->thread_data[ii] = taskset_thread_args(&pv->taskset, ii);
        if (pv->thread_data[ii] == NULL)
        {
            hb_error("NLMeans could not create thread args");
            goto fail;
        }
        pv->thread_data[ii]->pv = pv;
        pv->thread_data[ii]->arg.taskset = &pv->taskset;
        pv->thread_data[ii]->arg.segment = ii;
    }

    pv->output = *init;

    return 0;

fail:
    // Only tear down a taskset that was successfully set up.
    // NOTE(review): assumes taskset_init cleans up after itself when it
    // reports failure - confirm against taskset.c.
    if (taskset_ready)
    {
        taskset_fini(&pv->taskset);
    }
    nlmeans_free_frames(pv);
    free(pv->thread_data);
    free(pv);
    filter->private_data = NULL;
    return -1;
}

/* Stop the workers and release everything owned by the filter instance,
 * including frames still buffered after a flush. */
static void nlmeans_close(hb_filter_object_t *filter)
{
    hb_filter_private_t *pv = filter->private_data;

    if (pv == NULL)
    {
        return;
    }

    taskset_fini(&pv->taskset);

    nlmeans_free_frames(pv);
    free(pv->thread_data);
    free(pv);
    filter->private_data = NULL;
}

/* Produce the output buffer for one buffered frame.
 *
 * frames_available caps the temporal depth handed to nlmeans_plane; the
 * per-channel nframes setting is used when it is smaller. The frame's
 * property buffer is copied onto the output and then closed.
 * Returns the newly created output buffer. */
static hb_buffer_t * nlmeans_render_frame(hb_filter_private_t *pv,
                                          Frame *frame,
                                          int frames_available)
{
    hb_buffer_t *buf = hb_frame_buffer_init(pv->output.pix_fmt,
                                            frame->width, frame->height);
    buf->f.color_prim      = pv->output.color_prim;
    buf->f.color_transfer  = pv->output.color_transfer;
    buf->f.color_matrix    = pv->output.color_matrix;
    buf->f.color_range     = pv->output.color_range;
    buf->f.chroma_location = pv->output.chroma_location;

    NLMeansFunctions *functions = &pv->functions;

    for (int c = 0; c < 3; c++)
    {
        if (pv->prefilter[c] & NLMEANS_PREFILTER_MODE_PASSTHRU)
        {
            // Passthru: the prefiltered plane is the output
            pv->nlmeans_prefilter(&frame->plane[c], pv->prefilter[c]);
            pv->nlmeans_deborder(&frame->plane[c],
                                 buf->plane[c].data,
                                 buf->plane[c].width,
                                 buf->plane[c].stride / pv->bps,
                                 buf->plane[c].height);
            continue;
        }
        if (pv->strength[c] == 0)
        {
            // Zero strength: copy the plane through without denoising
            pv->nlmeans_deborder(&frame->plane[c],
                                 buf->plane[c].data,
                                 buf->plane[c].width,
                                 buf->plane[c].stride / pv->bps,
                                 buf->plane[c].height);
            continue;
        }

        int nframes = frames_available;
        if (pv->nframes[c] < nframes)
        {
            nframes = pv->nframes[c];
        }

        // Process current plane
        pv->nlmeans_plane(functions,
                          frame,
                          pv->prefilter[c],
                          c,
                          nframes,
                          buf->plane[c].data,
                          buf->plane[c].width,
                          buf->plane[c].stride / pv->bps,
                          buf->plane[c].height,
                          pv->strength[c],
                          pv->origin_tune[c],
                          pv->patch_size[c],
                          pv->range[c],
                          pv->exptable[c],
                          pv->weight_fact_table[c],
                          pv->diff_max[c]);
    }

    hb_buffer_copy_props(buf, frame->buf);
    hb_buffer_close(&frame->buf);

    return buf;
}

/* Taskset worker: render the frame slot matching this thread's segment and
 * publish the result in the thread's `out` field. */
static void nlmeans_filter_work(void *thread_args_v)
{
    nlmeans_thread_arg_t *thread_data = thread_args_v;
    hb_filter_private_t  *pv = thread_data->pv;
    int segment = thread_data->arg.segment;

    // nframes[c] never exceeds max_frames, so this cap leaves the
    // configured per-channel depth untouched.
    thread_data->out = nlmeans_render_frame(pv, &pv->frame[segment],
                                            pv->max_frames);
}

/* Copy an input buffer into the next free frame slot: bordered copies of
 * the three planes plus a property-only buffer. */
static void nlmeans_add_frame(hb_filter_private_t *pv, hb_buffer_t *buf)
{
    Frame *slot = &pv->frame[pv->next_frame];

    for (int c = 0; c < 3; c++)
    {
        // Extend copy of plane with extra border and place in buffer
        const int border = ((pv->patch_size[c] + 2) / 2 + 15) / 16 * 16;
        pv->nlmeans_alloc(buf->plane[c].data,
                          buf->plane[c].width,
                          buf->plane[c].stride / pv->bps,
                          buf->plane[c].height,
                          &slot->plane[c],
                          border);
    }
    slot->width  = buf->f.width;
    slot->height = buf->f.height;
    slot->fmt    = buf->f.fmt;
    slot->buf    = hb_buffer_init(0);
    hb_buffer_copy_props(slot->buf, buf);

    pv->next_frame++;
}

/* Run one taskset cycle once enough frames are buffered.
 *
 * Returns NULL while fewer than max_frames + threads frames are queued;
 * otherwise returns the chain of `threads` output buffers and shifts the
 * remaining frames down to the start of the slot array. */
static hb_buffer_t * nlmeans_filter(hb_filter_private_t *pv)
{
    if (pv->next_frame < pv->max_frames + pv->threads)
    {
        return NULL;
    }

    taskset_cycle(&pv->taskset);

    // Free buffers that are not needed for next taskset cycle
    for (int t = 0; t < pv->threads; t++)
    {
        for (int c = 0; c < 3; c++)
        {
            // Release last frame in buffer
            nlmeans_release_plane(&pv->frame[t].plane[c]);
        }
    }

    // Shift frames in buffer down
    for (int f = 0; f < pv->max_frames; f++)
    {
        // Don't move the mutex!
        Frame frame = pv->frame[f];
        pv->frame[f] = pv->frame[f+pv->threads];
        for (int c = 0; c < 3; c++)
        {
            pv->frame[f].plane[c].mutex = frame.plane[c].mutex;
            pv->frame[f+pv->threads].plane[c].mem_pre = NULL;
            pv->frame[f+pv->threads].plane[c].mem = NULL;
        }
        // Ownership of the property buffer moved with the frame; drop the
        // duplicate pointer so the vacated slot owns nothing.
        pv->frame[f+pv->threads].buf = NULL;
    }
    pv->next_frame -= pv->threads;

    // Collect results from taskset
    hb_buffer_list_t list;
    hb_buffer_list_clear(&list);
    for (int t = 0; t < pv->threads; t++)
    {
        hb_buffer_list_append(&list, pv->thread_data[t]->out);
    }
    return hb_buffer_list_clear(&list);
}

/* Render every frame still buffered, on the calling thread, limiting the
 * temporal depth to the frames that remain after each one. Returns the
 * chain of output buffers. Plane storage is left for nlmeans_close. */
static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
{
    hb_buffer_list_t list;

    hb_buffer_list_clear(&list);
    for (int f = 0; f < pv->next_frame; f++)
    {
        hb_buffer_t *buf = nlmeans_render_frame(pv, &pv->frame[f],
                                                pv->next_frame - f);
        hb_buffer_list_append(&list, buf);
    }
    return hb_buffer_list_clear(&list);
}

/* Filter entry point. On EOF, flushes the buffered frames and appends the
 * EOF buffer to the output chain; otherwise queues the input frame and
 * returns whatever output is ready (possibly none). */
static int nlmeans_work(hb_filter_object_t *filter,
                        hb_buffer_t **buf_in,
                        hb_buffer_t **buf_out )
{
    hb_filter_private_t *pv = filter->private_data;
    hb_buffer_t *in = *buf_in;

    if (in->s.flags & HB_BUF_FLAG_EOF)
    {
        hb_buffer_list_t list;
        hb_buffer_t *buf;

        // Flush buffered frames
        buf = nlmeans_filter_flush(pv);
        hb_buffer_list_set(&list, buf);

        // And terminate the buffer list with a EOF buffer
        hb_buffer_list_append(&list, in);
        *buf_out = hb_buffer_list_clear(&list);

        *buf_in = NULL;
        return HB_FILTER_DONE;
    }

    nlmeans_add_frame(pv, in);
    *buf_out = nlmeans_filter(pv);

    return HB_FILTER_OK;
}