import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

import DefaultLayout from "/home/runner/work/coqui-ai.github.io/coqui-ai.github.io/src/templates/BlogTemplate.tsx";
import { graphql } from 'gatsby';
// Page-level GraphQL query exported from this module. It declares a
// $fileAbsolutePath variable and spreads SidebarPageFragment, which is defined
// outside this file.
// NOTE(review): the variable is not referenced in the visible query text;
// presumably the fragment consumes it — confirm before removing it.
export const pageQuery = graphql`
  query($fileAbsolutePath: String) {
    ...SidebarPageFragment
  }
`;
// Frontmatter export for this page; it is an empty object here.
export const _frontmatter = {};

// The blog template component is used as the layout that wraps the content.
const MDXLayout = DefaultLayout;

// Module-level values handed to the layout; MDXContent spreads these onto
// MDXLayout before the props it receives from its caller.
const layoutProps = { pageQuery: pageQuery, _frontmatter: _frontmatter };
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">



    <p>{`In this post, I’d like to introduce two methods that, in my experience, worked
well for better attention alignment in Tacotron models. If you’d like to try them
on your own, you can visit `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/TTS"
      }}>{`Coqui TTS`}</a>{`. The first
method is `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/1907.09006"
      }}>{`Bidirectional Decoder`}</a>{` and the
second is `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/1308.0850"
      }}>{`Graves Attention`}</a>{` (Gaussian
Attention) with small tweaks.`}</p>
    <h3 {...{
      "id": "bidirectional-decoder",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#bidirectional-decoder",
        "aria-label": "bidirectional decoder permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Bidirectional Decoder`}</h3>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "1000px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "46.4%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAJCAYAAAAywQxIAAAACXBIWXMAAAsTAAALEwEAmpwYAAABxElEQVQoz1VSyY7TQBDN/x858QdI3EBzQSAktkEsChKZiIyTjOPYcWLHjp12L+7Nj7KdBFHS62rV8rqqqyYgcb4j+AG9dARLNus8OrLl+wTheoV4u8E6WICzMzqr4Y26RNPZjXrSJ/nxPrgeXz/H5vOrwWYIWgnwmEj2IbRoUC9/gpcH8N0KKgthjIFz7kY6IT5wznEqS1giWH19i3T+HaeH9zhO72CsR0OEMvkDyys06ylkvkUdLSDSFfktlJLU3Ug6cUTScIEkPYCxBu3YNeosRpkswaWGIK2JxAkGsZlBH7dQu2DQHcWzikMJPRIK7VBJh7DgSMkR7Eu0ph3a31YdSt5CWYXWaRiCuNwlaeMNRN0iC2uwQo4tWyrR04cJY8FaSrJjhU/1HotTRDaDH9USC5agNhwfj3NEIsfsHGHJ0yFWcAmtzXUo3W1SvbwI7vFmN8OXwxTvok/g1MGMxQhkirNX+NWECGWGByIM2I6IqFopqSj3j3BYkcvKPJvf4WV4fxt7YzS+RXP8TgLEeYoPT1MEZYKgOSCRxTDh68oMLfdHv4f+sotXMTT+Hv1D+alEXhTISgJpJgQc5Rlv6c3/9/AvW26v6PezQ78AAAAASUVORK5CYII=')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/e7f08e2cac087f13ae1c38673c41f046/da8b6/blog-tts-two-methods-for-better-attention-in-tacotron-architecture.png",
          "srcSet": ["/static/e7f08e2cac087f13ae1c38673c41f046/43fa5/blog-tts-two-methods-for-better-attention-in-tacotron-architecture.png 250w", "/static/e7f08e2cac087f13ae1c38673c41f046/c6e3d/blog-tts-two-methods-for-better-attention-in-tacotron-architecture.png 500w", "/static/e7f08e2cac087f13ae1c38673c41f046/da8b6/blog-tts-two-methods-for-better-attention-in-tacotron-architecture.png 1000w", "/static/e7f08e2cac087f13ae1c38673c41f046/d2a27/blog-tts-two-methods-for-better-attention-in-tacotron-architecture.png 1419w"],
          "sizes": "(max-width: 1000px) 100vw, 1000px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`Bidirectional decoding uses an extra decoder which takes the encoder outputs in
the reverse order and then, there is an extra loss function that compares the
output states of the forward decoder with the backward one. With this
additional loss, the forward decoder models what it needs to expect for the
next iterations. In this regard, the backward decoder punishes bad decisions of
the forward decoder and vice versa.`}</p>
    <p>{`Intuitively, if the forward decoder fails to align the attention, that would
cause a big loss and ultimately it would learn to go monotonically through the
alignment process with a correction induced by the backward decoder. Therefore,
this method is able to prevent “catastrophic failure” where the attention falls
apart in the middle of a sentence and it never aligns again.`}</p>
    <p>{`At inference time, the paper suggests using only the forward decoder and
discarding the backward decoder. However, it is possible to think of more
elaborate ways to combine these two models.`}</p>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "727px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "30%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAGCAYAAADDl76dAAAACXBIWXMAAAsTAAALEwEAmpwYAAABh0lEQVQY00WQSy9DURSFT9GHUmLiXxgYVFT0oW5bWnr1treP0EETiURLNZHoa2Ri5g+YGogYGRRRDeJRIZQqEkkHfgRVatn3Ek6ys5OzvrP2XofF+pMwswk4VAHYlX7YW6ia/eDUItzdETy+HAP4QqlQAaclpjUg6xLnIJ5TeMHrIji/3iaqBjbTuwgDc8LKBFiYh2qCSoBJ4YFTG0KxWoR0bvbLMEma0vvHDVE3MjdcqknsXOyijnewWf0SgW6aKsLW5AOnCcDWFiRYAN8xhb3KPRpkWDooY5h5wSlFcO0hSuCHTeGTOY8ujI3DE7ziEyzaJxnycKgpii4ImyYgm1vpMd8VRv7ygaKQYaFM8Xw0VKS4NFwblHmJEzrD2Myd4a3RAJvuSUDPRiiiIK9vZONyNzAX/auIQv5OjnxNfYDuBn/1H44nzo1RWmZr/Qi1Om24Gl9D1JJGYiiLuDmD2GASC5Ys5o0pJMeWcXv6JBs+l6qIm9KkZf456nN0l/Kt4CR3hXrtA998YAbjGk7mYQAAAABJRU5ErkJggg==')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/927f7566bf5e778d1a19f43f88a53bae/b5a20/blog-tts-two-methods-for-better-attention-in-tacotron-alignment.png",
          "srcSet": ["/static/927f7566bf5e778d1a19f43f88a53bae/43fa5/blog-tts-two-methods-for-better-attention-in-tacotron-alignment.png 250w", "/static/927f7566bf5e778d1a19f43f88a53bae/c6e3d/blog-tts-two-methods-for-better-attention-in-tacotron-alignment.png 500w", "/static/927f7566bf5e778d1a19f43f88a53bae/b5a20/blog-tts-two-methods-for-better-attention-in-tacotron-alignment.png 727w"],
          "sizes": "(max-width: 727px) 100vw, 727px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`There are 2 main pitfalls of this method. First, due to the additional
parameters of the backward decoder, it is slower to train this model (almost
2x), and this makes a huge difference especially when the reduction rate
(number of frames the model generates per iteration) is low. Second, if the
backward decoder penalizes the forward one too harshly, that causes overall
prosody degradation. Because of this, the paper suggests activating the
additional loss just for fine-tuning.`}</p>
    <p>{`My experience is that Bidirectional training is quite robust against alignment
problems and it is especially useful if your dataset is hard. It also almost
aligns after the first epoch. Yes, at inference time, it sometimes causes
pronunciation problems but I solved this by doing the opposite of the paper’s
suggestion: I fine-tuned the network for just one epoch without the additional
loss, and everything started to work well.`}</p>
    <h3 {...{
      "id": "graves-attention",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#graves-attention",
        "aria-label": "graves attention permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Graves Attention`}</h3>
    <p>{`Tacotron uses Bahdanau Attention, which is a content-based attention method. It
does not consider location information. Therefore, it needs to learn the
monotonicity of the alignment just by looking at the content, which is a hard
problem. Tacotron2 uses Location Sensitive Attention, which takes into account
the previous attention weights. By doing so, it learns the monotonic
constraint. But it does not solve all of the problems, and you can still
experience failures with long or out-of-domain sentences.`}</p>
    <p>{`Graves Attention is an alternative that uses content information to decide how
far it needs to go on the alignment per iteration. It does this by using a
mixture of Gaussian distributions.`}</p>
    <p>{`Graves Attention takes the context vector of time t-1 and passes it through
a couple of fully connected layers (`}{`[FC > ReLU > FC]`}{` in our model) and estimates
the step size, variance and distribution weights for time t. Then the estimated
step size is used to update the mean of the Gaussian modes. Intuitively, the mean is
the point of interest on the alignment path, the variance is the attention window over
this point of interest, and the distribution weight is the importance of each
distribution head.`}</p>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "772px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "29.599999999999998%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAGCAYAAADDl76dAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAnklEQVQY042QjQqEIBCEff+XDAuVyLS0Hytyjl3ouOqCFoYFR7+dVeBSOWfu27YhhABjDKvv+5P/VOIJuK4rnHOQUkJrjZTS1//VdYi4wg6TAJSqLEtUVYW2bW93/iV+TDjPMwNjjBiGAfu+nx7TBq8SHp2A3nsURQFrLeq6Rtd1aJqGEyul+I9frUyiBMfa0zSx6GwcRwaTtyzLDfgBqTjW6fhwktgAAAAASUVORK5CYII=')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/d1ceb96f942ada9348462e66567f5718/5a533/blog-tts-two-methods-for-better-attention-in-tacotron-equation.png",
          "srcSet": ["/static/d1ceb96f942ada9348462e66567f5718/43fa5/blog-tts-two-methods-for-better-attention-in-tacotron-equation.png 250w", "/static/d1ceb96f942ada9348462e66567f5718/c6e3d/blog-tts-two-methods-for-better-attention-in-tacotron-equation.png 500w", "/static/d1ceb96f942ada9348462e66567f5718/5a533/blog-tts-two-methods-for-better-attention-in-tacotron-equation.png 772w"],
          "sizes": "(max-width: 772px) 100vw, 772px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`This is the formulation I use to compute the alignment in my implementation. Here g, b, and k
are intermediate values, 𝛿 is the step size, σ is the variance, and
w`}<sub>{`k`}</sub>{` is the distribution weight for the GMM node k.`}</p>
    <p>{`Some other versions are explained `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/1910.10288"
      }}>{`here`}</a>{` but
so far I have found that the above formulation works best for me, without any NaNs in
training. I also realized that with the best-claimed method in this paper, one
of the distribution nodes overruns the others in the middle of the training and
basically, attention starts to run on a single Gaussian head.`}</p>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "1000px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "40%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAICAYAAAD5nd/tAAAACXBIWXMAAAsTAAALEwEAmpwYAAACDElEQVQoz22S7U+SURiHn17c+iOaK821NCqtZTM/JOBWBjlS82WTIa2IF9MgNSFAV5qIzXyBGrVsi61af5QfWrlJKS8Cz/Mgz9URXJ/6cG3n/HbtPtv9O5Kj+SmGI73opT48ej8/5d+YVxNccc2hC88xtJ4gW0jj0PnQ1/TTcXwQt2GaFBm6Psa4sDTH2fkF7FNxFK2IZG3wcE26LbjFSIuPzcwuxjfrnLOHqAsF6Z5/x590BmvDKFclC63Cs7Y8YUvZoyO+TH1ohpNLYSzBGAVVQQp0LzBU52TolINQT5St7TS29x/o9EfQv4zgXPxEdidHoP81g3UuBmsfEBReKpvH/jnGzcgrbqxEeBTbQM6LgSW1hCqrqIpKSSlRLmsUiyqFgkJRIMsKmqZR9ZR/3kEmKwr5QhU5r1Yy6WBAhf1yJQCBVj1r++Jarmb/9w6hfIgYONsb5X7jGCNNXiKOFXZLKca+fWFgYh1zNMHzja8ocpqwZQl742NsTT4izmVypPEkk9jGo9yLxwh/T4q3RSnDZ9xi0WaBiZFmL5vFHQxv19CZpqidDWJaFKVkRCn1HlGKWdCF9bKXX2oe/bMVLrWNcfrhLJbpuChFRnK1B+g8MUDnsV5Grwf4kdrBsrpG690gFydm6JtPkE5lcbb5Mdb0YTzag1t425kcdyZXaRdf6Pz4C4Yn48h7Cn8BrCfGT+YhPpsAAAAASUVORK5CYII=')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/982ff79cfecd1736d016d5821d5b7869/da8b6/blog-tts-two-methods-for-better-attention-in-tacotron-alignments.png",
          "srcSet": ["/static/982ff79cfecd1736d016d5821d5b7869/43fa5/blog-tts-two-methods-for-better-attention-in-tacotron-alignments.png 250w", "/static/982ff79cfecd1736d016d5821d5b7869/c6e3d/blog-tts-two-methods-for-better-attention-in-tacotron-alignments.png 500w", "/static/982ff79cfecd1736d016d5821d5b7869/da8b6/blog-tts-two-methods-for-better-attention-in-tacotron-alignments.png 1000w", "/static/982ff79cfecd1736d016d5821d5b7869/16caf/blog-tts-two-methods-for-better-attention-in-tacotron-alignments.png 1465w"],
          "sizes": "(max-width: 1000px) 100vw, 1000px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`The benefit of using GMM is to have more robust attention. It is also
computationally light-weight compared to both bidirectional decoding and normal
location attention. Therefore, you can increase your batch size and possibly
converge faster.`}</p>
    <p>{`The downside is that, although my experiments are not complete, GMM has
provided slightly worse prosody and naturalness compared to the other methods.`}</p>
    <h3 {...{
      "id": "comparison",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#comparison",
        "aria-label": "comparison permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Comparison`}</h3>
    <p>{`Here I compare Graves Attention, Bidirectional Decoding, and Location Sensitive
Attention trained on the `}<a parentName="p" {...{
        "href": "https://keithito.com/LJ-Speech-Dataset/"
      }}>{`LJSpeech`}</a>{`
dataset. For the comparison, I used the set of sentences provided by `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/1905.09263"
      }}>{`this
work`}</a>{`. There are 50 sentences in total.`}</p>
    <p>{`Out of these 50 sentences Bidirectional Decoding has 1 failure, Graves
attention has 6 failures, Location Sensitive Attention has 18, and Location
Sensitive Attention with inference time windowing has 11.`}</p>
    <p>{`In terms of prosodic quality, in my opinion, Location Sensitive Attention >
Bidirectional Decoding > Graves Attention > Location Sensitive Attention with
Windowing. However, I should say the quality difference is hardly observable in
the LJSpeech dataset. I also need to point out that it is a hard dataset.`}</p>
    <p>{`If you’d like to try these methods, they are all implemented in `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/TTS"
      }}>{`Coqui
TTS`}</a>{`. Give them a try!`}</p>


    </MDXLayout>;
}
;
// Static marker set on the component.
// NOTE(review): presumably read by MDX tooling to recognise compiled MDX
// content — confirm against the @mdx-js documentation.
MDXContent.isMDXComponent = true;
      