af_sofalizer.c 33.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
/*****************************************************************************
 * sofalizer.c : SOFAlizer filter for virtual binaural acoustics
 *****************************************************************************
 * Copyright (C) 2013-2015 Andreas Fuchs, Wolfgang Hrauda,
 *                         Acoustics Research Institute (ARI), Vienna, Austria
 *
 * Authors: Andreas Fuchs <andi.fuchs.mail@gmail.com>
 *          Wolfgang Hrauda <wolfgang.hrauda@gmx.at>
 *
 * SOFAlizer project coordinator at ARI, main developer of SOFA:
 *          Piotr Majdak <piotr@majdak.at>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#include <math.h>
29
#include <mysofa.h>
30

31
#include "libavcodec/avfft.h"
32 33
#include "libavutil/avstring.h"
#include "libavutil/channel_layout.h"
34
#include "libavutil/float_dsp.h"
35
#include "libavutil/intmath.h"
36 37 38 39 40
#include "libavutil/opt.h"
#include "avfilter.h"
#include "internal.h"
#include "audio.h"

41 42 43
#define TIME_DOMAIN      0
#define FREQUENCY_DOMAIN 1

44 45
typedef struct MySofa {  /* contains data of one SOFA file */
    struct MYSOFA_EASY *easy;
46
    int n_samples;       /* length of one impulse response (IR) */
47 48 49
    float *lir, *rir;    /* IRs (time-domain) */
    int max_delay;
} MySofa;
50

51 52 53 54 55 56
typedef struct VirtualSpeaker {
    uint8_t set;
    float azim;
    float elev;
} VirtualSpeaker;

57 58 59 60
typedef struct SOFAlizerContext {
    const AVClass *class;

    char *filename;             /* name of SOFA file */
61
    MySofa sofa;                /* contains data of the SOFA file */
62 63

    int sample_rate;            /* sample rate from SOFA file */
64 65
    float *speaker_azim;        /* azimuth of the virtual loudspeakers */
    float *speaker_elev;        /* elevation of the virtual loudspeakers */
66
    char *speakers_pos;         /* custom positions of the virtual loudspeakers */
67
    float lfe_gain;             /* initial gain for the LFE channel */
68
    float gain_lfe;             /* gain applied to LFE channel */
69
    int lfe_channel;            /* LFE channel position in channel layout */
70 71 72 73 74 75 76 77 78

    int n_conv;                 /* number of channels to convolute */

                                /* buffer variables (for convolution) */
    float *ringbuffer[2];       /* buffers input samples, length of one buffer: */
                                /* no. input ch. (incl. LFE) x buffer_length */
    int write[2];               /* current write position to ringbuffer */
    int buffer_length;          /* is: longest IR plus max. delay in all SOFA files */
                                /* then choose next power of 2 */
79
    int n_fft;                  /* number of samples in one FFT block */
80 81 82 83 84 85 86

                                /* netCDF variables */
    int *delay[2];              /* broadband delay for each channel/IR to be convolved */

    float *data_ir[2];          /* IRs for all channels to be convolved */
                                /* (this excludes the LFE) */
    float *temp_src[2];
87
    FFTComplex *temp_fft[2];
88 89 90 91 92 93

                         /* control variables */
    float gain;          /* filter gain (in dB) */
    float rotation;      /* rotation of virtual loudspeakers (in degrees)  */
    float elevation;     /* elevation of virtual loudspeakers (in deg.) */
    float radius;        /* distance virtual loudspeakers to listener (in metres) */
94 95
    int type;            /* processing type */

96 97
    VirtualSpeaker vspkrpos[64];

98 99
    FFTContext *fft[2], *ifft[2];
    FFTComplex *data_hrtf[2];
100 101 102 103

    AVFloatDSPContext *fdsp;
} SOFAlizerContext;

104
static int close_sofa(struct MySofa *sofa)
105
{
106 107
    mysofa_close(sofa->easy);
    sofa->easy = NULL;
108 109 110 111

    return 0;
}

112
static int preload_sofa(AVFilterContext *ctx, char *filename, int *samplingrate)
113 114
{
    struct SOFAlizerContext *s = ctx->priv;
115 116
    struct MYSOFA_HRTF *mysofa;
    int ret;
117

118 119 120
    mysofa = mysofa_load(filename, &ret);
    if (ret || !mysofa) {
        av_log(ctx, AV_LOG_ERROR, "Can't find SOFA-file '%s'\n", filename);
121 122 123
        return AVERROR(EINVAL);
    }

124
    if (mysofa->DataSamplingRate.elements != 1)
125
        return AVERROR(EINVAL);
126 127 128
    *samplingrate = mysofa->DataSamplingRate.values[0];
    s->sofa.n_samples = mysofa->N;
    mysofa_free(mysofa);
129

130 131 132
    return 0;
}

133
static int parse_channel_name(char **arg, int *rchannel, char *buf)
134 135 136 137 138
{
    int len, i, channel_id = 0;
    int64_t layout, layout0;

    /* try to parse a channel name, e.g. "FL" */
139
    if (av_sscanf(*arg, "%7[A-Z]%n", buf, &len)) {
140 141 142
        layout0 = layout = av_get_channel_layout(buf);
        /* channel_id <- first set bit in layout */
        for (i = 32; i > 0; i >>= 1) {
143
            if (layout >= 1LL << i) {
144 145 146 147 148
                channel_id += i;
                layout >>= i;
            }
        }
        /* reject layouts that are not a single channel */
149
        if (channel_id >= 64 || layout0 != 1LL << channel_id)
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
            return AVERROR(EINVAL);
        *rchannel = channel_id;
        *arg += len;
        return 0;
    }
    return AVERROR(EINVAL);
}

static void parse_speaker_pos(AVFilterContext *ctx, int64_t in_channel_layout)
{
    SOFAlizerContext *s = ctx->priv;
    char *arg, *tokenizer, *p, *args = av_strdup(s->speakers_pos);

    if (!args)
        return;
    p = args;

    while ((arg = av_strtok(p, "|", &tokenizer))) {
168
        char buf[8];
169 170 171 172
        float azim, elev;
        int out_ch_id;

        p = NULL;
173 174
        if (parse_channel_name(&arg, &out_ch_id, buf)) {
            av_log(ctx, AV_LOG_WARNING, "Failed to parse \'%s\' as channel name.\n", buf);
175
            continue;
176
        }
177
        if (av_sscanf(arg, "%f %f", &azim, &elev) == 2) {
178 179 180
            s->vspkrpos[out_ch_id].set = 1;
            s->vspkrpos[out_ch_id].azim = azim;
            s->vspkrpos[out_ch_id].elev = elev;
181
        } else if (av_sscanf(arg, "%f", &azim) == 1) {
182 183 184 185 186 187 188 189 190
            s->vspkrpos[out_ch_id].set = 1;
            s->vspkrpos[out_ch_id].azim = azim;
            s->vspkrpos[out_ch_id].elev = 0;
        }
    }

    av_free(args);
}

191 192
static int get_speaker_pos(AVFilterContext *ctx,
                           float *speaker_azim, float *speaker_elev)
193 194 195
{
    struct SOFAlizerContext *s = ctx->priv;
    uint64_t channels_layout = ctx->inputs[0]->channel_layout;
196 197 198 199 200 201
    float azim[16] = { 0 };
    float elev[16] = { 0 };
    int m, ch, n_conv = ctx->inputs[0]->channels; /* get no. input channels */

    if (n_conv > 16)
        return AVERROR(EINVAL);
202

203
    s->lfe_channel = -1;
204

205 206 207
    if (s->speakers_pos)
        parse_speaker_pos(ctx, channels_layout);

208
    /* set speaker positions according to input channel configuration: */
209
    for (m = 0, ch = 0; ch < n_conv && m < 64; m++) {
210
        uint64_t mask = channels_layout & (1ULL << m);
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248

        switch (mask) {
        case AV_CH_FRONT_LEFT:            azim[ch] =  30;      break;
        case AV_CH_FRONT_RIGHT:           azim[ch] = 330;      break;
        case AV_CH_FRONT_CENTER:          azim[ch] =   0;      break;
        case AV_CH_LOW_FREQUENCY:
        case AV_CH_LOW_FREQUENCY_2:       s->lfe_channel = ch; break;
        case AV_CH_BACK_LEFT:             azim[ch] = 150;      break;
        case AV_CH_BACK_RIGHT:            azim[ch] = 210;      break;
        case AV_CH_BACK_CENTER:           azim[ch] = 180;      break;
        case AV_CH_SIDE_LEFT:             azim[ch] =  90;      break;
        case AV_CH_SIDE_RIGHT:            azim[ch] = 270;      break;
        case AV_CH_FRONT_LEFT_OF_CENTER:  azim[ch] =  15;      break;
        case AV_CH_FRONT_RIGHT_OF_CENTER: azim[ch] = 345;      break;
        case AV_CH_TOP_CENTER:            azim[ch] =   0;
                                          elev[ch] =  90;      break;
        case AV_CH_TOP_FRONT_LEFT:        azim[ch] =  30;
                                          elev[ch] =  45;      break;
        case AV_CH_TOP_FRONT_CENTER:      azim[ch] =   0;
                                          elev[ch] =  45;      break;
        case AV_CH_TOP_FRONT_RIGHT:       azim[ch] = 330;
                                          elev[ch] =  45;      break;
        case AV_CH_TOP_BACK_LEFT:         azim[ch] = 150;
                                          elev[ch] =  45;      break;
        case AV_CH_TOP_BACK_RIGHT:        azim[ch] = 210;
                                          elev[ch] =  45;      break;
        case AV_CH_TOP_BACK_CENTER:       azim[ch] = 180;
                                          elev[ch] =  45;      break;
        case AV_CH_WIDE_LEFT:             azim[ch] =  90;      break;
        case AV_CH_WIDE_RIGHT:            azim[ch] = 270;      break;
        case AV_CH_SURROUND_DIRECT_LEFT:  azim[ch] =  90;      break;
        case AV_CH_SURROUND_DIRECT_RIGHT: azim[ch] = 270;      break;
        case AV_CH_STEREO_LEFT:           azim[ch] =  90;      break;
        case AV_CH_STEREO_RIGHT:          azim[ch] = 270;      break;
        case 0:                                                break;
        default:
            return AVERROR(EINVAL);
        }
249 250 251 252 253 254

        if (s->vspkrpos[m].set) {
            azim[ch] = s->vspkrpos[m].azim;
            elev[ch] = s->vspkrpos[m].elev;
        }

255 256
        if (mask)
            ch++;
257 258
    }

259 260
    memcpy(speaker_azim, azim, n_conv * sizeof(float));
    memcpy(speaker_elev, elev, n_conv * sizeof(float));
261 262 263 264 265 266 267 268 269 270 271 272 273

    return 0;

}

typedef struct ThreadData {
    AVFrame *in, *out;
    int *write;
    int **delay;
    float **ir;
    int *n_clippings;
    float **ringbuffer;
    float **temp_src;
274
    FFTComplex **temp_fft;
275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291
} ThreadData;

static int sofalizer_convolute(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    SOFAlizerContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in = td->in, *out = td->out;
    int offset = jobnr;
    int *write = &td->write[jobnr];
    const int *const delay = td->delay[jobnr];
    const float *const ir = td->ir[jobnr];
    int *n_clippings = &td->n_clippings[jobnr];
    float *ringbuffer = td->ringbuffer[jobnr];
    float *temp_src = td->temp_src[jobnr];
    const int n_samples = s->sofa.n_samples; /* length of one IR */
    const float *src = (const float *)in->data[0]; /* get pointer to audio input buffer */
    float *dst = (float *)out->data[0]; /* get pointer to audio output buffer */
292
    const int in_channels = s->n_conv; /* number of input channels */
293
    /* ring buffer length is: longest IR plus max. delay -> next power of 2 */
294
    const int buffer_length = s->buffer_length;
295
    /* -1 for AND instead of MODULO (applied to powers of 2): */
296
    const uint32_t modulo = (uint32_t)buffer_length - 1;
297
    float *buffer[16]; /* holds ringbuffer for each input channel */
298 299
    int wr = *write;
    int read;
300
    int i, l;
301 302 303 304 305 306 307 308 309 310

    dst += offset;
    for (l = 0; l < in_channels; l++) {
        /* get starting address of ringbuffer for each input channel */
        buffer[l] = ringbuffer + l * buffer_length;
    }

    for (i = 0; i < in->nb_samples; i++) {
        const float *temp_ir = ir; /* using same set of IRs for each sample */

311
        dst[0] = 0;
312 313
        for (l = 0; l < in_channels; l++) {
            /* write current input sample to ringbuffer (for each channel) */
314
            buffer[l][wr] = src[l];
315 316
        }

317 318
        /* loop goes through all channels to be convolved */
        for (l = 0; l < in_channels; l++) {
319 320
            const float *const bptr = buffer[l];

321 322 323 324
            if (l == s->lfe_channel) {
                /* LFE is an input channel but requires no convolution */
                /* apply gain to LFE signal and add to output buffer */
                *dst += *(buffer[s->lfe_channel] + wr) * s->gain_lfe;
325
                temp_ir += FFALIGN(n_samples, 32);
326 327 328
                continue;
            }

329 330 331
            /* current read position in ringbuffer: input sample write position
             * - delay for l-th ch. + diff. betw. IR length and buffer length
             * (mod buffer length) */
332
            read = (wr - delay[l] - (n_samples - 1) + buffer_length) & modulo;
333

334
            if (read + n_samples < buffer_length) {
335
                memmove(temp_src, bptr + read, n_samples * sizeof(*temp_src));
336
            } else {
337 338
                int len = FFMIN(n_samples - (read % n_samples), buffer_length - read);

339 340
                memmove(temp_src, bptr + read, len * sizeof(*temp_src));
                memmove(temp_src + len, bptr, (n_samples - len) * sizeof(*temp_src));
341
            }
342 343 344

            /* multiply signal and IR, and add up the results */
            dst[0] += s->fdsp->scalarproduct_float(temp_ir, temp_src, n_samples);
345
            temp_ir += FFALIGN(n_samples, 32);
346 347 348
        }

        /* clippings counter */
349
        if (fabs(dst[0]) > 1)
350 351 352 353 354 355 356 357 358 359 360 361 362
            *n_clippings += 1;

        /* move output buffer pointer by +2 to get to next sample of processed channel: */
        dst += 2;
        src += in_channels;
        wr   = (wr + 1) & modulo; /* update ringbuffer write position */
    }

    *write = wr; /* remember write position in ringbuffer for next call */

    return 0;
}

363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385
static int sofalizer_fast_convolute(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
    SOFAlizerContext *s = ctx->priv;
    ThreadData *td = arg;
    AVFrame *in = td->in, *out = td->out;
    int offset = jobnr;
    int *write = &td->write[jobnr];
    FFTComplex *hrtf = s->data_hrtf[jobnr]; /* get pointers to current HRTF data */
    int *n_clippings = &td->n_clippings[jobnr];
    float *ringbuffer = td->ringbuffer[jobnr];
    const int n_samples = s->sofa.n_samples; /* length of one IR */
    const float *src = (const float *)in->data[0]; /* get pointer to audio input buffer */
    float *dst = (float *)out->data[0]; /* get pointer to audio output buffer */
    const int in_channels = s->n_conv; /* number of input channels */
    /* ring buffer length is: longest IR plus max. delay -> next power of 2 */
    const int buffer_length = s->buffer_length;
    /* -1 for AND instead of MODULO (applied to powers of 2): */
    const uint32_t modulo = (uint32_t)buffer_length - 1;
    FFTComplex *fft_in = s->temp_fft[jobnr]; /* temporary array for FFT input/output data */
    FFTContext *ifft = s->ifft[jobnr];
    FFTContext *fft = s->fft[jobnr];
    const int n_conv = s->n_conv;
    const int n_fft = s->n_fft;
386 387
    const float fft_scale = 1.0f / s->n_fft;
    FFTComplex *hrtf_offset;
388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420
    int wr = *write;
    int n_read;
    int i, j;

    dst += offset;

    /* find minimum between number of samples and output buffer length:
     * (important, if one IR is longer than the output buffer) */
    n_read = FFMIN(s->sofa.n_samples, in->nb_samples);
    for (j = 0; j < n_read; j++) {
        /* initialize output buf with saved signal from overflow buf */
        dst[2 * j]     = ringbuffer[wr];
        ringbuffer[wr] = 0.0; /* re-set read samples to zero */
        /* update ringbuffer read/write position */
        wr  = (wr + 1) & modulo;
    }

    /* initialize rest of output buffer with 0 */
    for (j = n_read; j < in->nb_samples; j++) {
        dst[2 * j] = 0;
    }

    for (i = 0; i < n_conv; i++) {
        if (i == s->lfe_channel) { /* LFE */
            for (j = 0; j < in->nb_samples; j++) {
                /* apply gain to LFE signal and add to output buffer */
                dst[2 * j] += src[i + j * in_channels] * s->gain_lfe;
            }
            continue;
        }

        /* outer loop: go through all input channels to be convolved */
        offset = i * n_fft; /* no. samples already processed */
421
        hrtf_offset = hrtf + offset;
422 423 424 425 426 427 428 429 430 431 432 433 434 435

        /* fill FFT input with 0 (we want to zero-pad) */
        memset(fft_in, 0, sizeof(FFTComplex) * n_fft);

        for (j = 0; j < in->nb_samples; j++) {
            /* prepare input for FFT */
            /* write all samples of current input channel to FFT input array */
            fft_in[j].re = src[j * in_channels + i];
        }

        /* transform input signal of current channel to frequency domain */
        av_fft_permute(fft, fft_in);
        av_fft_calc(fft, fft_in);
        for (j = 0; j < n_fft; j++) {
436
            const FFTComplex *hcomplex = hrtf_offset + j;
437 438 439 440 441
            const float re = fft_in[j].re;
            const float im = fft_in[j].im;

            /* complex multiplication of input signal and HRTFs */
            /* output channel (real): */
442
            fft_in[j].re = re * hcomplex->re - im * hcomplex->im;
443
            /* output channel (imag): */
444
            fft_in[j].im = re * hcomplex->im + im * hcomplex->re;
445 446 447 448 449 450 451 452
        }

        /* transform output signal of current channel back to time domain */
        av_fft_permute(ifft, fft_in);
        av_fft_calc(ifft, fft_in);

        for (j = 0; j < in->nb_samples; j++) {
            /* write output signal of current channel to output buffer */
453
            dst[2 * j] += fft_in[j].re * fft_scale;
454 455 456 457 458 459
        }

        for (j = 0; j < n_samples - 1; j++) { /* overflow length is IR length - 1 */
            /* write the rest of output signal to overflow buffer */
            int write_pos = (wr + j) & modulo;

460
            *(ringbuffer + write_pos) += fft_in[in->nb_samples + j].re * fft_scale;
461 462 463 464 465 466 467
        }
    }

    /* go through all samples of current output buffer: count clippings */
    for (i = 0; i < out->nb_samples; i++) {
        /* clippings counter */
        if (fabs(*dst) > 1) { /* if current output sample > 1 */
468
            n_clippings[0]++;
469 470 471 472 473 474 475 476 477 478 479 480
        }

        /* move output buffer pointer by +2 to get to next sample of processed channel: */
        dst += 2;
    }

    /* remember read/write position in ringbuffer for next call */
    *write = wr;

    return 0;
}

481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    SOFAlizerContext *s = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    int n_clippings[2] = { 0 };
    ThreadData td;
    AVFrame *out;

    out = ff_get_audio_buffer(outlink, in->nb_samples);
    if (!out) {
        av_frame_free(&in);
        return AVERROR(ENOMEM);
    }
    av_frame_copy_props(out, in);

    td.in = in; td.out = out; td.write = s->write;
    td.delay = s->delay; td.ir = s->data_ir; td.n_clippings = n_clippings;
    td.ringbuffer = s->ringbuffer; td.temp_src = s->temp_src;
500
    td.temp_fft = s->temp_fft;
501

502 503 504 505 506
    if (s->type == TIME_DOMAIN) {
        ctx->internal->execute(ctx, sofalizer_convolute, &td, NULL, 2);
    } else {
        ctx->internal->execute(ctx, sofalizer_fast_convolute, &td, NULL, 2);
    }
507 508
    emms_c();

509
    /* display error message if clipping occurred */
510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532
    if (n_clippings[0] + n_clippings[1] > 0) {
        av_log(ctx, AV_LOG_WARNING, "%d of %d samples clipped. Please reduce gain.\n",
               n_clippings[0] + n_clippings[1], out->nb_samples * 2);
    }

    av_frame_free(&in);
    return ff_filter_frame(outlink, out);
}

static int query_formats(AVFilterContext *ctx)
{
    struct SOFAlizerContext *s = ctx->priv;
    AVFilterFormats *formats = NULL;
    AVFilterChannelLayouts *layouts = NULL;
    int ret, sample_rates[] = { 48000, -1 };

    ret = ff_add_format(&formats, AV_SAMPLE_FMT_FLT);
    if (ret)
        return ret;
    ret = ff_set_common_formats(ctx, formats);
    if (ret)
        return ret;

533
    layouts = ff_all_channel_layouts();
534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556
    if (!layouts)
        return AVERROR(ENOMEM);

    ret = ff_channel_layouts_ref(layouts, &ctx->inputs[0]->out_channel_layouts);
    if (ret)
        return ret;

    layouts = NULL;
    ret = ff_add_channel_layout(&layouts, AV_CH_LAYOUT_STEREO);
    if (ret)
        return ret;

    ret = ff_channel_layouts_ref(layouts, &ctx->outputs[0]->in_channel_layouts);
    if (ret)
        return ret;

    sample_rates[0] = s->sample_rate;
    formats = ff_make_format_list(sample_rates);
    if (!formats)
        return AVERROR(ENOMEM);
    return ff_set_common_samplerates(ctx, formats);
}

557
static int load_data(AVFilterContext *ctx, int azim, int elev, float radius, int sample_rate)
558 559
{
    struct SOFAlizerContext *s = ctx->priv;
560
    int n_samples;
561
    int n_conv = s->n_conv; /* no. channels to convolve */
562 563 564
    int n_fft;
    float delay_l; /* broadband delay for each IR */
    float delay_r;
565 566
    int nb_input_channels = ctx->inputs[0]->channels; /* no. input channels */
    float gain_lin = expf((s->gain - 3 * nb_input_channels) / 20 * M_LN10); /* gain - 3dB/channel */
567 568 569 570
    FFTComplex *data_hrtf_l = NULL;
    FFTComplex *data_hrtf_r = NULL;
    FFTComplex *fft_in_l = NULL;
    FFTComplex *fft_in_r = NULL;
571 572 573
    float *data_ir_l = NULL;
    float *data_ir_r = NULL;
    int offset = 0; /* used for faster pointer arithmetics in for-loop */
574
    int i, j, azim_orig = azim, elev_orig = elev;
575 576 577
    int filter_length, ret = 0;
    int n_current;
    int n_max = 0;
578

579 580
    s->sofa.easy = mysofa_open(s->filename, sample_rate, &filter_length, &ret);
    if (!s->sofa.easy || ret) { /* if an invalid SOFA file has been selected */
581 582 583 584
        av_log(ctx, AV_LOG_ERROR, "Selected SOFA file is invalid. Please select valid SOFA file.\n");
        return AVERROR_INVALIDDATA;
    }

585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604
    n_samples = s->sofa.n_samples;

    s->data_ir[0] = av_calloc(FFALIGN(n_samples, 32), sizeof(float) * s->n_conv);
    s->data_ir[1] = av_calloc(FFALIGN(n_samples, 32), sizeof(float) * s->n_conv);
    s->delay[0] = av_calloc(s->n_conv, sizeof(int));
    s->delay[1] = av_calloc(s->n_conv, sizeof(int));

    if (!s->data_ir[0] || !s->data_ir[1] || !s->delay[0] || !s->delay[1]) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    /* get temporary IR for L and R channel */
    data_ir_l = av_calloc(n_conv * FFALIGN(n_samples, 32), sizeof(*data_ir_l));
    data_ir_r = av_calloc(n_conv * FFALIGN(n_samples, 32), sizeof(*data_ir_r));
    if (!data_ir_r || !data_ir_l) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }

605
    if (s->type == TIME_DOMAIN) {
606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676
        s->temp_src[0] = av_calloc(FFALIGN(n_samples, 32), sizeof(float));
        s->temp_src[1] = av_calloc(FFALIGN(n_samples, 32), sizeof(float));
        if (!s->temp_src[0] || !s->temp_src[1]) {
            ret = AVERROR(ENOMEM);
            goto fail;
        }
    }

    s->speaker_azim = av_calloc(s->n_conv, sizeof(*s->speaker_azim));
    s->speaker_elev = av_calloc(s->n_conv, sizeof(*s->speaker_elev));
    if (!s->speaker_azim || !s->speaker_elev) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    /* get speaker positions */
    if ((ret = get_speaker_pos(ctx, s->speaker_azim, s->speaker_elev)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Couldn't get speaker positions. Input channel configuration not supported.\n");
        goto fail;
    }

    for (i = 0; i < s->n_conv; i++) {
        float coordinates[3];

        /* load and store IRs and corresponding delays */
        azim = (int)(s->speaker_azim[i] + azim_orig) % 360;
        elev = (int)(s->speaker_elev[i] + elev_orig) % 90;

        coordinates[0] = azim;
        coordinates[1] = elev;
        coordinates[2] = radius;

        mysofa_s2c(coordinates);

        /* get id of IR closest to desired position */
        mysofa_getfilter_float(s->sofa.easy, coordinates[0], coordinates[1], coordinates[2],
                               data_ir_l + FFALIGN(n_samples, 32) * i,
                               data_ir_r + FFALIGN(n_samples, 32) * i,
                               &delay_l, &delay_r);

        s->delay[0][i] = delay_l * sample_rate;
        s->delay[1][i] = delay_r * sample_rate;

        s->sofa.max_delay = FFMAX3(s->sofa.max_delay, s->delay[0][i], s->delay[1][i]);
    }

    /* get size of ringbuffer (longest IR plus max. delay) */
    /* then choose next power of 2 for performance optimization */
    n_current = s->sofa.n_samples + s->sofa.max_delay;
    /* length of longest IR plus max. delay */
    n_max = FFMAX(n_max, n_current);

    /* buffer length is longest IR plus max. delay -> next power of 2
       (32 - count leading zeros gives required exponent)  */
    s->buffer_length = 1 << (32 - ff_clz(n_max));
    s->n_fft = n_fft = 1 << (32 - ff_clz(n_max + sample_rate));

    if (s->type == FREQUENCY_DOMAIN) {
        av_fft_end(s->fft[0]);
        av_fft_end(s->fft[1]);
        s->fft[0] = av_fft_init(log2(s->n_fft), 0);
        s->fft[1] = av_fft_init(log2(s->n_fft), 0);
        av_fft_end(s->ifft[0]);
        av_fft_end(s->ifft[1]);
        s->ifft[0] = av_fft_init(log2(s->n_fft), 1);
        s->ifft[1] = av_fft_init(log2(s->n_fft), 1);

        if (!s->fft[0] || !s->fft[1] || !s->ifft[0] || !s->ifft[1]) {
            av_log(ctx, AV_LOG_ERROR, "Unable to create FFT contexts of size %d.\n", s->n_fft);
            ret = AVERROR(ENOMEM);
            goto fail;
677
        }
678 679 680 681 682
    }

    if (s->type == TIME_DOMAIN) {
        s->ringbuffer[0] = av_calloc(s->buffer_length, sizeof(float) * nb_input_channels);
        s->ringbuffer[1] = av_calloc(s->buffer_length, sizeof(float) * nb_input_channels);
683 684 685 686 687
    } else {
        /* get temporary HRTF memory for L and R channel */
        data_hrtf_l = av_malloc_array(n_fft, sizeof(*data_hrtf_l) * n_conv);
        data_hrtf_r = av_malloc_array(n_fft, sizeof(*data_hrtf_r) * n_conv);
        if (!data_hrtf_r || !data_hrtf_l) {
688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712
            ret = AVERROR(ENOMEM);
            goto fail;
        }

        s->ringbuffer[0] = av_calloc(s->buffer_length, sizeof(float));
        s->ringbuffer[1] = av_calloc(s->buffer_length, sizeof(float));
        s->temp_fft[0] = av_malloc_array(s->n_fft, sizeof(FFTComplex));
        s->temp_fft[1] = av_malloc_array(s->n_fft, sizeof(FFTComplex));
        if (!s->temp_fft[0] || !s->temp_fft[1]) {
            ret = AVERROR(ENOMEM);
            goto fail;
        }
    }

    if (!s->ringbuffer[0] || !s->ringbuffer[1]) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    if (s->type == FREQUENCY_DOMAIN) {
        fft_in_l = av_calloc(n_fft, sizeof(*fft_in_l));
        fft_in_r = av_calloc(n_fft, sizeof(*fft_in_r));
        if (!fft_in_l || !fft_in_r) {
            ret = AVERROR(ENOMEM);
            goto fail;
713
        }
714 715 716
    }

    for (i = 0; i < s->n_conv; i++) {
717
        float *lir, *rir;
718

719 720 721 722
        offset = i * FFALIGN(n_samples, 32); /* no. samples already written */

        lir = data_ir_l + offset;
        rir = data_ir_r + offset;
723

724 725 726 727
        if (s->type == TIME_DOMAIN) {
            for (j = 0; j < n_samples; j++) {
                /* load reversed IRs of the specified source position
                 * sample-by-sample for left and right ear; and apply gain */
728 729
                s->data_ir[0][offset + j] = lir[n_samples - 1 - j] * gain_lin;
                s->data_ir[1][offset + j] = rir[n_samples - 1 - j] * gain_lin;
730 731
            }
        } else {
732 733
            memset(fft_in_l, 0, n_fft * sizeof(*fft_in_l));
            memset(fft_in_r, 0, n_fft * sizeof(*fft_in_r));
734 735 736 737 738 739 740

            offset = i * n_fft; /* no. samples already written */
            for (j = 0; j < n_samples; j++) {
                /* load non-reversed IRs of the specified source position
                 * sample-by-sample and apply gain,
                 * L channel is loaded to real part, R channel to imag part,
                 * IRs ared shifted by L and R delay */
741 742
                fft_in_l[s->delay[0][i] + j].re = lir[j] * gain_lin;
                fft_in_r[s->delay[1][i] + j].re = rir[j] * gain_lin;
743 744 745 746 747 748 749 750 751
            }

            /* actually transform to frequency domain (IRs -> HRTFs) */
            av_fft_permute(s->fft[0], fft_in_l);
            av_fft_calc(s->fft[0], fft_in_l);
            memcpy(data_hrtf_l + offset, fft_in_l, n_fft * sizeof(*fft_in_l));
            av_fft_permute(s->fft[0], fft_in_r);
            av_fft_calc(s->fft[0], fft_in_r);
            memcpy(data_hrtf_r + offset, fft_in_r, n_fft * sizeof(*fft_in_r));
752 753 754
        }
    }

755
    if (s->type == FREQUENCY_DOMAIN) {
756 757 758
        s->data_hrtf[0] = av_malloc_array(n_fft * s->n_conv, sizeof(FFTComplex));
        s->data_hrtf[1] = av_malloc_array(n_fft * s->n_conv, sizeof(FFTComplex));
        if (!s->data_hrtf[0] || !s->data_hrtf[1]) {
759 760
            ret = AVERROR(ENOMEM);
            goto fail;
761 762 763 764 765 766
        }

        memcpy(s->data_hrtf[0], data_hrtf_l, /* copy HRTF data to */
            sizeof(FFTComplex) * n_conv * n_fft); /* filter struct */
        memcpy(s->data_hrtf[1], data_hrtf_r,
            sizeof(FFTComplex) * n_conv * n_fft);
767
    }
768

769 770 771
fail:
    av_freep(&data_hrtf_l); /* free temporary HRTF memory */
    av_freep(&data_hrtf_r);
772

773 774
    av_freep(&data_ir_l); /* free temprary IR memory */
    av_freep(&data_ir_r);
775

776 777
    av_freep(&fft_in_l); /* free temporary FFT memory */
    av_freep(&fft_in_r);
778

779
    return ret;
780 781 782 783 784 785 786
}

static av_cold int init(AVFilterContext *ctx)
{
    SOFAlizerContext *s = ctx->priv;
    int ret;

787 788 789 790 791
    if (!s->filename) {
        av_log(ctx, AV_LOG_ERROR, "Valid SOFA filename must be set.\n");
        return AVERROR(EINVAL);
    }

792 793
    /* preload SOFA file, */
    ret = preload_sofa(ctx, s->filename, &s->sample_rate);
794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818
    if (ret) {
        /* file loading error */
        av_log(ctx, AV_LOG_ERROR, "Error while loading SOFA file: '%s'\n", s->filename);
    } else { /* no file loading error, resampling not required */
        av_log(ctx, AV_LOG_DEBUG, "File '%s' loaded.\n", s->filename);
    }

    if (ret) {
        av_log(ctx, AV_LOG_ERROR, "No valid SOFA file could be loaded. Please specify valid SOFA file.\n");
        return ret;
    }

    s->fdsp = avpriv_float_dsp_alloc(0);
    if (!s->fdsp)
        return AVERROR(ENOMEM);

    return 0;
}

static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    SOFAlizerContext *s = ctx->priv;
    int ret;

819 820 821 822 823 824
    if (s->type == FREQUENCY_DOMAIN) {
        inlink->partial_buf_size =
        inlink->min_samples =
        inlink->max_samples = inlink->sample_rate;
    }

825
    /* gain -3 dB per channel, -6 dB to get LFE on a similar level */
826
    s->gain_lfe = expf((s->gain - 3 * inlink->channels - 6 + s->lfe_gain) / 20 * M_LN10);
827

828
    s->n_conv = inlink->channels;
829

830
    /* load IRs to data_ir[0] and data_ir[1] for required directions */
831
    if ((ret = load_data(ctx, s->rotation, s->elevation, s->radius, inlink->sample_rate)) < 0)
832 833 834
        return ret;

    av_log(ctx, AV_LOG_DEBUG, "Samplerate: %d Channels to convolute: %d, Length of ringbuffer: %d x %d\n",
835
        inlink->sample_rate, s->n_conv, inlink->channels, s->buffer_length);
836 837 838 839 840 841 842 843

    return 0;
}

static av_cold void uninit(AVFilterContext *ctx)
{
    SOFAlizerContext *s = ctx->priv;

844
    close_sofa(&s->sofa);
845 846 847 848
    av_fft_end(s->ifft[0]);
    av_fft_end(s->ifft[1]);
    av_fft_end(s->fft[0]);
    av_fft_end(s->fft[1]);
849 850 851 852 853 854
    av_freep(&s->delay[0]);
    av_freep(&s->delay[1]);
    av_freep(&s->data_ir[0]);
    av_freep(&s->data_ir[1]);
    av_freep(&s->ringbuffer[0]);
    av_freep(&s->ringbuffer[1]);
855 856
    av_freep(&s->speaker_azim);
    av_freep(&s->speaker_elev);
857 858
    av_freep(&s->temp_src[0]);
    av_freep(&s->temp_src[1]);
859 860 861 862
    av_freep(&s->temp_fft[0]);
    av_freep(&s->temp_fft[1]);
    av_freep(&s->data_hrtf[0]);
    av_freep(&s->data_hrtf[1]);
863 864 865 866 867 868 869 870 871 872 873 874
    av_freep(&s->fdsp);
}

#define OFFSET(x) offsetof(SOFAlizerContext, x)
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM

static const AVOption sofalizer_options[] = {
    { "sofa",      "sofa filename",  OFFSET(filename),  AV_OPT_TYPE_STRING, {.str=NULL},            .flags = FLAGS },
    { "gain",      "set gain in dB", OFFSET(gain),      AV_OPT_TYPE_FLOAT,  {.dbl=0},     -20,  40, .flags = FLAGS },
    { "rotation",  "set rotation"  , OFFSET(rotation),  AV_OPT_TYPE_FLOAT,  {.dbl=0},    -360, 360, .flags = FLAGS },
    { "elevation", "set elevation",  OFFSET(elevation), AV_OPT_TYPE_FLOAT,  {.dbl=0},     -90,  90, .flags = FLAGS },
    { "radius",    "set radius",     OFFSET(radius),    AV_OPT_TYPE_FLOAT,  {.dbl=1},       0,   3, .flags = FLAGS },
875 876 877
    { "type",      "set processing", OFFSET(type),      AV_OPT_TYPE_INT,    {.i64=1},       0,   1, .flags = FLAGS, "type" },
    { "time",      "time domain",      0,               AV_OPT_TYPE_CONST,  {.i64=0},       0,   0, .flags = FLAGS, "type" },
    { "freq",      "frequency domain", 0,               AV_OPT_TYPE_CONST,  {.i64=1},       0,   0, .flags = FLAGS, "type" },
878
    { "speakers",  "set speaker custom positions", OFFSET(speakers_pos), AV_OPT_TYPE_STRING,  {.str=0},    0, 0, .flags = FLAGS },
879
    { "lfegain",   "set lfe gain",                 OFFSET(lfe_gain),     AV_OPT_TYPE_FLOAT,   {.dbl=0},   -9, 9, .flags = FLAGS },
880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914
    { NULL }
};

AVFILTER_DEFINE_CLASS(sofalizer);

static const AVFilterPad inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_AUDIO,
        .config_props = config_input,
        .filter_frame = filter_frame,
    },
    { NULL }
};

static const AVFilterPad outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
    },
    { NULL }
};

AVFilter ff_af_sofalizer = {
    .name          = "sofalizer",
    .description   = NULL_IF_CONFIG_SMALL("SOFAlizer (Spatially Oriented Format for Acoustics)."),
    .priv_size     = sizeof(SOFAlizerContext),
    .priv_class    = &sofalizer_class,
    .init          = init,
    .uninit        = uninit,
    .query_formats = query_formats,
    .inputs        = inputs,
    .outputs       = outputs,
    .flags         = AVFILTER_FLAG_SLICE_THREADS,
};