misc/echo/echo.c

0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /*
0003  * SpanDSP - a series of DSP components for telephony
0004  *
0005  * echo.c - A line echo canceller.  This code is being developed
0006  *          against and partially complies with G168.
0007  *
0008  * Written by Steve Underwood <steveu@coppice.org>
0009  *         and David Rowe <david_at_rowetel_dot_com>
0010  *
0011  * Copyright (C) 2001, 2003 Steve Underwood, 2007 David Rowe
0012  *
0013  * Based on a bit from here, a bit from there, eye of toad, ear of
0014  * bat, 15 years of failed attempts by David and a few fried brain
0015  * cells.
0016  *
0017  * All rights reserved.
0018  */
0019
0020 /*! \file */
0021
0022 /* Implementation Notes
0023    David Rowe
0024    April 2007
0025
0026    This code started life as Steve's NLMS algorithm with a tap
0027    rotation algorithm to handle divergence during double talk.  I
0028    added a Geigel Double Talk Detector (DTD) [2] and performed some
0029    G168 tests.  However I had trouble meeting the G168 requirements,
0030    especially for double talk - there were always cases where my DTD
0031    failed, for example where near end speech was under the 6dB
0032    threshold required for declaring double talk.
0033
0034    So I tried a two path algorithm [1], which has so far given better
0035    results.  The original tap rotation/Geigel algorithm is available
0036    in SVN http://svn.rowetel.com/software/oslec/tags/before_16bit.
0037    It's probably possible to make it work if some one wants to put some
0038    serious work into it.
0039
0040    At present no special treatment is provided for tones, which
0041    generally cause NLMS algorithms to diverge.  Initial runs of a
0042    subset of the G168 tests for tones (e.g ./echo_test 6) show the
0043    current algorithm is passing OK, which is kind of surprising.  The
0044    full set of tests needs to be performed to confirm this result.
0045
0046    One other interesting change is that I have managed to get the NLMS
0047    code to work with 16 bit coefficients, rather than the original 32
0048    bit coefficents.  This reduces the MIPs and storage required.
0049    I evaulated the 16 bit port using g168_tests.sh and listening tests
0050    on 4 real-world samples.
0051
0052    I also attempted the implementation of a block based NLMS update
0053    [2] but although this passes g168_tests.sh it didn't converge well
0054    on the real-world samples.  I have no idea why, perhaps a scaling
0055    problem.  The block based code is also available in SVN
0056    http://svn.rowetel.com/software/oslec/tags/before_16bit.  If this
0057    code can be debugged, it will lead to further reduction in MIPS, as
0058    the block update code maps nicely onto DSP instruction sets (it's a
0059    dot product) compared to the current sample-by-sample update.
0060
0061    Steve also has some nice notes on echo cancellers in echo.h
0062
0063    References:
0064
0065    [1] Ochiai, Areseki, and Ogihara, "Echo Canceller with Two Echo
0066        Path Models", IEEE Transactions on communications, COM-25,
0067        No. 6, June
0068        1977.
0069        https://www.rowetel.com/images/echo/dual_path_paper.pdf
0070
0071    [2] The classic, very useful paper that tells you how to
0072        actually build a real world echo canceller:
0073      Messerschmitt, Hedberg, Cole, Haoui, Winship, "Digital Voice
0074      Echo Canceller with a TMS320020,
0075      https://www.rowetel.com/images/echo/spra129.pdf
0076
0077    [3] I have written a series of blog posts on this work, here is
0078        Part 1: http://www.rowetel.com/blog/?p=18
0079
0080    [4] The source code http://svn.rowetel.com/software/oslec/
0081
0082    [5] A nice reference on LMS filters:
0083      https://en.wikipedia.org/wiki/Least_mean_squares_filter
0084
0085    Credits:
0086
0087    Thanks to Steve Underwood, Jean-Marc Valin, and Ramakrishnan
0088    Muthukrishnan for their suggestions and email discussions.  Thanks
0089    also to those people who collected echo samples for me such as
0090    Mark, Pawel, and Pavel.
0091 */
0092
0093 #include <linux/kernel.h>
0094 #include <linux/module.h>
0095 #include <linux/slab.h>
0096
0097 #include "echo.h"
0098
0099 #define MIN_TX_POWER_FOR_ADAPTION   64
0100 #define MIN_RX_POWER_FOR_ADAPTION   64
0101 #define DTD_HANGOVER            600 /* 600 samples, or 75ms     */
0102 #define DC_LOG2BETA         3   /* log2() of DC filter Beta */
0103
0104 /* adapting coeffs using the traditional stochastic descent (N)LMS algorithm */
0105
0106 static inline void lms_adapt_bg(struct oslec_state *ec, int clean, int shift)
0107 {
0108     int i;
0109
0110     int offset1;
0111     int offset2;
0112     int factor;
0113     int exp;
0114
0115     if (shift > 0)
0116         factor = clean << shift;
0117     else
0118         factor = clean >> -shift;
0119
0120     /* Update the FIR taps */
0121
0122     offset2 = ec->curr_pos;
0123     offset1 = ec->taps - offset2;
0124
0125     for (i = ec->taps - 1; i >= offset1; i--) {
0126         exp = (ec->fir_state_bg.history[i - offset1] * factor);
0127         ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
0128     }
0129     for (; i >= 0; i--) {
0130         exp = (ec->fir_state_bg.history[i + offset2] * factor);
0131         ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
0132     }
0133 }
0134
0135 static inline int top_bit(unsigned int bits)
0136 {
0137     if (bits == 0)
0138         return -1;
0139     else
0140         return (int)fls((int32_t) bits) - 1;
0141 }
0142
0143 struct oslec_state *oslec_create(int len, int adaption_mode)
0144 {
0145     struct oslec_state *ec;
0146     int i;
0147     const int16_t *history;
0148
0149     ec = kzalloc(sizeof(*ec), GFP_KERNEL);
0150     if (!ec)
0151         return NULL;
0152
0153     ec->taps = len;
0154     ec->log2taps = top_bit(len);
0155     ec->curr_pos = ec->taps - 1;
0156
0157     ec->fir_taps16[0] =
0158         kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
0159     if (!ec->fir_taps16[0])
0160         goto error_oom_0;
0161
0162     ec->fir_taps16[1] =
0163         kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
0164     if (!ec->fir_taps16[1])
0165         goto error_oom_1;
0166
0167     history = fir16_create(&ec->fir_state, ec->fir_taps16[0], ec->taps);
0168     if (!history)
0169         goto error_state;
0170     history = fir16_create(&ec->fir_state_bg, ec->fir_taps16[1], ec->taps);
0171     if (!history)
0172         goto error_state_bg;
0173
0174     for (i = 0; i < 5; i++)
0175         ec->xvtx[i] = ec->yvtx[i] = ec->xvrx[i] = ec->yvrx[i] = 0;
0176
0177     ec->cng_level = 1000;
0178     oslec_adaption_mode(ec, adaption_mode);
0179
0180     ec->snapshot = kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
0181     if (!ec->snapshot)
0182         goto error_snap;
0183
0184     ec->cond_met = 0;
0185     ec->pstates = 0;
0186     ec->ltxacc = ec->lrxacc = ec->lcleanacc = ec->lclean_bgacc = 0;
0187     ec->ltx = ec->lrx = ec->lclean = ec->lclean_bg = 0;
0188     ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
0189     ec->lbgn = ec->lbgn_acc = 0;
0190     ec->lbgn_upper = 200;
0191     ec->lbgn_upper_acc = ec->lbgn_upper << 13;
0192
0193     return ec;
0194
0195 error_snap:
0196     fir16_free(&ec->fir_state_bg);
0197 error_state_bg:
0198     fir16_free(&ec->fir_state);
0199 error_state:
0200     kfree(ec->fir_taps16[1]);
0201 error_oom_1:
0202     kfree(ec->fir_taps16[0]);
0203 error_oom_0:
0204     kfree(ec);
0205     return NULL;
0206 }
0207 EXPORT_SYMBOL_GPL(oslec_create);
0208
0209 void oslec_free(struct oslec_state *ec)
0210 {
0211     int i;
0212
0213     fir16_free(&ec->fir_state);
0214     fir16_free(&ec->fir_state_bg);
0215     for (i = 0; i < 2; i++)
0216         kfree(ec->fir_taps16[i]);
0217     kfree(ec->snapshot);
0218     kfree(ec);
0219 }
0220 EXPORT_SYMBOL_GPL(oslec_free);
0221
0222 void oslec_adaption_mode(struct oslec_state *ec, int adaption_mode)
0223 {
0224     ec->adaption_mode = adaption_mode;
0225 }
0226 EXPORT_SYMBOL_GPL(oslec_adaption_mode);
0227
0228 void oslec_flush(struct oslec_state *ec)
0229 {
0230     int i;
0231
0232     ec->ltxacc = ec->lrxacc = ec->lcleanacc = ec->lclean_bgacc = 0;
0233     ec->ltx = ec->lrx = ec->lclean = ec->lclean_bg = 0;
0234     ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
0235
0236     ec->lbgn = ec->lbgn_acc = 0;
0237     ec->lbgn_upper = 200;
0238     ec->lbgn_upper_acc = ec->lbgn_upper << 13;
0239
0240     ec->nonupdate_dwell = 0;
0241
0242     fir16_flush(&ec->fir_state);
0243     fir16_flush(&ec->fir_state_bg);
0244     ec->fir_state.curr_pos = ec->taps - 1;
0245     ec->fir_state_bg.curr_pos = ec->taps - 1;
0246     for (i = 0; i < 2; i++)
0247         memset(ec->fir_taps16[i], 0, ec->taps * sizeof(int16_t));
0248
0249     ec->curr_pos = ec->taps - 1;
0250     ec->pstates = 0;
0251 }
0252 EXPORT_SYMBOL_GPL(oslec_flush);
0253
0254 void oslec_snapshot(struct oslec_state *ec)
0255 {
0256     memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps * sizeof(int16_t));
0257 }
0258 EXPORT_SYMBOL_GPL(oslec_snapshot);
0259
0260 /* Dual Path Echo Canceller */
0261
0262 int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
0263 {
0264     int32_t echo_value;
0265     int clean_bg;
0266     int tmp;
0267     int tmp1;
0268
0269     /*
0270      * Input scaling was found be required to prevent problems when tx
0271      * starts clipping.  Another possible way to handle this would be the
0272      * filter coefficent scaling.
0273      */
0274
0275     ec->tx = tx;
0276     ec->rx = rx;
0277     tx >>= 1;
0278     rx >>= 1;
0279
0280     /*
0281      * Filter DC, 3dB point is 160Hz (I think), note 32 bit precision
0282      * required otherwise values do not track down to 0. Zero at DC, Pole
0283      * at (1-Beta) on real axis.  Some chip sets (like Si labs) don't
0284      * need this, but something like a $10 X100P card does.  Any DC really
0285      * slows down convergence.
0286      *
0287      * Note: removes some low frequency from the signal, this reduces the
0288      * speech quality when listening to samples through headphones but may
0289      * not be obvious through a telephone handset.
0290      *
0291      * Note that the 3dB frequency in radians is approx Beta, e.g. for Beta
0292      * = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz.
0293      */
0294
0295     if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF) {
0296         tmp = rx << 15;
0297
0298         /*
0299          * Make sure the gain of the HPF is 1.0. This can still
0300          * saturate a little under impulse conditions, and it might
0301          * roll to 32768 and need clipping on sustained peak level
0302          * signals. However, the scale of such clipping is small, and
0303          * the error due to any saturation should not markedly affect
0304          * the downstream processing.
0305          */
0306         tmp -= (tmp >> 4);
0307
0308         ec->rx_1 += -(ec->rx_1 >> DC_LOG2BETA) + tmp - ec->rx_2;
0309
0310         /*
0311          * hard limit filter to prevent clipping.  Note that at this
0312          * stage rx should be limited to +/- 16383 due to right shift
0313          * above
0314          */
0315         tmp1 = ec->rx_1 >> 15;
0316         if (tmp1 > 16383)
0317             tmp1 = 16383;
0318         if (tmp1 < -16383)
0319             tmp1 = -16383;
0320         rx = tmp1;
0321         ec->rx_2 = tmp;
0322     }
0323
0324     /* Block average of power in the filter states.  Used for
0325        adaption power calculation. */
0326
0327     {
0328         int new, old;
0329
0330         /* efficient "out with the old and in with the new" algorithm so
0331            we don't have to recalculate over the whole block of
0332            samples. */
0333         new = (int)tx * (int)tx;
0334         old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
0335             (int)ec->fir_state.history[ec->fir_state.curr_pos];
0336         ec->pstates +=
0337             ((new - old) + (1 << (ec->log2taps - 1))) >> ec->log2taps;
0338         if (ec->pstates < 0)
0339             ec->pstates = 0;
0340     }
0341
0342     /* Calculate short term average levels using simple single pole IIRs */
0343
0344     ec->ltxacc += abs(tx) - ec->ltx;
0345     ec->ltx = (ec->ltxacc + (1 << 4)) >> 5;
0346     ec->lrxacc += abs(rx) - ec->lrx;
0347     ec->lrx = (ec->lrxacc + (1 << 4)) >> 5;
0348
0349     /* Foreground filter */
0350
0351     ec->fir_state.coeffs = ec->fir_taps16[0];
0352     echo_value = fir16(&ec->fir_state, tx);
0353     ec->clean = rx - echo_value;
0354     ec->lcleanacc += abs(ec->clean) - ec->lclean;
0355     ec->lclean = (ec->lcleanacc + (1 << 4)) >> 5;
0356
0357     /* Background filter */
0358
0359     echo_value = fir16(&ec->fir_state_bg, tx);
0360     clean_bg = rx - echo_value;
0361     ec->lclean_bgacc += abs(clean_bg) - ec->lclean_bg;
0362     ec->lclean_bg = (ec->lclean_bgacc + (1 << 4)) >> 5;
0363
0364     /* Background Filter adaption */
0365
0366     /* Almost always adap bg filter, just simple DT and energy
0367        detection to minimise adaption in cases of strong double talk.
0368        However this is not critical for the dual path algorithm.
0369      */
0370     ec->factor = 0;
0371     ec->shift = 0;
0372     if (!ec->nonupdate_dwell) {
0373         int p, logp, shift;
0374
0375         /* Determine:
0376
0377            f = Beta * clean_bg_rx/P ------ (1)
0378
0379            where P is the total power in the filter states.
0380
0381            The Boffins have shown that if we obey (1) we converge
0382            quickly and avoid instability.
0383
0384            The correct factor f must be in Q30, as this is the fixed
0385            point format required by the lms_adapt_bg() function,
0386            therefore the scaled version of (1) is:
0387
0388            (2^30) * f  = (2^30) * Beta * clean_bg_rx/P
0389            factor      = (2^30) * Beta * clean_bg_rx/P     ----- (2)
0390
0391            We have chosen Beta = 0.25 by experiment, so:
0392
0393            factor      = (2^30) * (2^-2) * clean_bg_rx/P
0394
0395            (30 - 2 - log2(P))
0396            factor      = clean_bg_rx 2                     ----- (3)
0397
0398            To avoid a divide we approximate log2(P) as top_bit(P),
0399            which returns the position of the highest non-zero bit in
0400            P.  This approximation introduces an error as large as a
0401            factor of 2, but the algorithm seems to handle it OK.
0402
0403            Come to think of it a divide may not be a big deal on a
0404            modern DSP, so its probably worth checking out the cycles
0405            for a divide versus a top_bit() implementation.
0406          */
0407
0408         p = MIN_TX_POWER_FOR_ADAPTION + ec->pstates;
0409         logp = top_bit(p) + ec->log2taps;
0410         shift = 30 - 2 - logp;
0411         ec->shift = shift;
0412
0413         lms_adapt_bg(ec, clean_bg, shift);
0414     }
0415
0416     /* very simple DTD to make sure we dont try and adapt with strong
0417        near end speech */
0418
0419     ec->adapt = 0;
0420     if ((ec->lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->lrx > ec->ltx))
0421         ec->nonupdate_dwell = DTD_HANGOVER;
0422     if (ec->nonupdate_dwell)
0423         ec->nonupdate_dwell--;
0424
0425     /* Transfer logic */
0426
0427     /* These conditions are from the dual path paper [1], I messed with
0428        them a bit to improve performance. */
0429
0430     if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) &&
0431         (ec->nonupdate_dwell == 0) &&
0432         /* (ec->Lclean_bg < 0.875*ec->Lclean) */
0433         (8 * ec->lclean_bg < 7 * ec->lclean) &&
0434         /* (ec->Lclean_bg < 0.125*ec->Ltx) */
0435         (8 * ec->lclean_bg < ec->ltx)) {
0436         if (ec->cond_met == 6) {
0437             /*
0438              * BG filter has had better results for 6 consecutive
0439              * samples
0440              */
0441             ec->adapt = 1;
0442             memcpy(ec->fir_taps16[0], ec->fir_taps16[1],
0443                    ec->taps * sizeof(int16_t));
0444         } else
0445             ec->cond_met++;
0446     } else
0447         ec->cond_met = 0;
0448
0449     /* Non-Linear Processing */
0450
0451     ec->clean_nlp = ec->clean;
0452     if (ec->adaption_mode & ECHO_CAN_USE_NLP) {
0453         /*
0454          * Non-linear processor - a fancy way to say "zap small
0455          * signals, to avoid residual echo due to (uLaw/ALaw)
0456          * non-linearity in the channel.".
0457          */
0458
0459         if ((16 * ec->lclean < ec->ltx)) {
0460             /*
0461              * Our e/c has improved echo by at least 24 dB (each
0462              * factor of 2 is 6dB, so 2*2*2*2=16 is the same as
0463              * 6+6+6+6=24dB)
0464              */
0465             if (ec->adaption_mode & ECHO_CAN_USE_CNG) {
0466                 ec->cng_level = ec->lbgn;
0467
0468                 /*
0469                  * Very elementary comfort noise generation.
0470                  * Just random numbers rolled off very vaguely
0471                  * Hoth-like.  DR: This noise doesn't sound
0472                  * quite right to me - I suspect there are some
0473                  * overflow issues in the filtering as it's too
0474                  * "crackly".
0475                  * TODO: debug this, maybe just play noise at
0476                  * high level or look at spectrum.
0477                  */
0478
0479                 ec->cng_rndnum =
0480                     1664525U * ec->cng_rndnum + 1013904223U;
0481                 ec->cng_filter =
0482                     ((ec->cng_rndnum & 0xFFFF) - 32768 +
0483                      5 * ec->cng_filter) >> 3;
0484                 ec->clean_nlp =
0485                     (ec->cng_filter * ec->cng_level * 8) >> 14;
0486
0487             } else if (ec->adaption_mode & ECHO_CAN_USE_CLIP) {
0488                 /* This sounds much better than CNG */
0489                 if (ec->clean_nlp > ec->lbgn)
0490                     ec->clean_nlp = ec->lbgn;
0491                 if (ec->clean_nlp < -ec->lbgn)
0492                     ec->clean_nlp = -ec->lbgn;
0493             } else {
0494                 /*
0495                  * just mute the residual, doesn't sound very
0496                  * good, used mainly in G168 tests
0497                  */
0498                 ec->clean_nlp = 0;
0499             }
0500         } else {
0501             /*
0502              * Background noise estimator.  I tried a few
0503              * algorithms here without much luck.  This very simple
0504              * one seems to work best, we just average the level
0505              * using a slow (1 sec time const) filter if the
0506              * current level is less than a (experimentally
0507              * derived) constant.  This means we dont include high
0508              * level signals like near end speech.  When combined
0509              * with CNG or especially CLIP seems to work OK.
0510              */
0511             if (ec->lclean < 40) {
0512                 ec->lbgn_acc += abs(ec->clean) - ec->lbgn;
0513                 ec->lbgn = (ec->lbgn_acc + (1 << 11)) >> 12;
0514             }
0515         }
0516     }
0517
0518     /* Roll around the taps buffer */
0519     if (ec->curr_pos <= 0)
0520         ec->curr_pos = ec->taps;
0521     ec->curr_pos--;
0522
0523     if (ec->adaption_mode & ECHO_CAN_DISABLE)
0524         ec->clean_nlp = rx;
0525
0526     /* Output scaled back up again to match input scaling */
0527
0528     return (int16_t) ec->clean_nlp << 1;
0529 }
0530 EXPORT_SYMBOL_GPL(oslec_update);
0531
0532 /* This function is separated from the echo canceller is it is usually called
0533    as part of the tx process.  See rx HP (DC blocking) filter above, it's
0534    the same design.
0535
0536    Some soft phones send speech signals with a lot of low frequency
0537    energy, e.g. down to 20Hz.  This can make the hybrid non-linear
0538    which causes the echo canceller to fall over.  This filter can help
0539    by removing any low frequency before it gets to the tx port of the
0540    hybrid.
0541
0542    It can also help by removing and DC in the tx signal.  DC is bad
0543    for LMS algorithms.
0544
0545    This is one of the classic DC removal filters, adjusted to provide
0546    sufficient bass rolloff to meet the above requirement to protect hybrids
0547    from things that upset them. The difference between successive samples
0548    produces a lousy HPF, and then a suitably placed pole flattens things out.
0549    The final result is a nicely rolled off bass end. The filtering is
0550    implemented with extended fractional precision, which noise shapes things,
0551    giving very clean DC removal.
0552 */
0553
0554 int16_t oslec_hpf_tx(struct oslec_state *ec, int16_t tx)
0555 {
0556     int tmp;
0557     int tmp1;
0558
0559     if (ec->adaption_mode & ECHO_CAN_USE_TX_HPF) {
0560         tmp = tx << 15;
0561
0562         /*
0563          * Make sure the gain of the HPF is 1.0. The first can still
0564          * saturate a little under impulse conditions, and it might
0565          * roll to 32768 and need clipping on sustained peak level
0566          * signals. However, the scale of such clipping is small, and
0567          * the error due to any saturation should not markedly affect
0568          * the downstream processing.
0569          */
0570         tmp -= (tmp >> 4);
0571
0572         ec->tx_1 += -(ec->tx_1 >> DC_LOG2BETA) + tmp - ec->tx_2;
0573         tmp1 = ec->tx_1 >> 15;
0574         if (tmp1 > 32767)
0575             tmp1 = 32767;
0576         if (tmp1 < -32767)
0577             tmp1 = -32767;
0578         tx = tmp1;
0579         ec->tx_2 = tmp;
0580     }
0581
0582     return tx;
0583 }
0584 EXPORT_SYMBOL_GPL(oslec_hpf_tx);
0585
0586 MODULE_LICENSE("GPL");
0587 MODULE_AUTHOR("David Rowe");
0588 MODULE_DESCRIPTION("Open Source Line Echo Canceller");
0589 MODULE_VERSION("0.3.0");