import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

import DefaultLayout from "/home/runner/work/coqui-ai.github.io/coqui-ai.github.io/src/templates/BlogTemplate.tsx";
import { graphql } from 'gatsby';
// GraphQL page query exported from this module, built with gatsby's `graphql` tag.
// It declares a $fileAbsolutePath variable and spreads SidebarPageFragment,
// which is defined outside this file.
export const pageQuery = graphql`
  query($fileAbsolutePath: String) {
    ...SidebarPageFragment
  }
`;
// Frontmatter export for this page; an empty object here.
export const _frontmatter = {};
// Layout component that wraps the rendered page content.
const MDXLayout = DefaultLayout;
// Props handed to the layout; they are spread before the caller's own props,
// so caller-supplied props with the same key take precedence.
const layoutProps = {
  pageQuery: pageQuery,
  _frontmatter: _frontmatter
};
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">



    <p>{`Tacotron is a commonly used Text-to-Speech architecture. It is a very flexible
alternative to traditional solutions. It only requires text and corresponding
voice clips to train the model. It avoids the toil of fine-grained annotation
of the data. However, Tacotron might also be very time-consuming to train,
especially if you don’t know the right hyperparameters to begin with. Here, I’d
like to share a gradual training scheme to ease the training difficulty. In my
experiments, it provides faster training, tolerance for various hyperparameter
values, and more time with your family.`}</p>
    <p>{`In summary, Tacotron is an Encoder-Decoder architecture with Attention. It
takes a sentence as a sequence of characters (or phonemes) and outputs a
sequence of spectrogram frames to be ultimately converted to speech with an
additional vocoder algorithm (e.g. Griffin-Lim or WaveRNN). There are two
versions of Tacotron, Tacotron and Tacotron2. Tacotron is a complicated
architecture but it has fewer model parameters compared to Tacotron2.
Tacotron2 is much simpler but it is about 4x larger (7M vs 24M parameters). To
be clear, so far I have only used this gradual training method with Tacotron,
and I am about to begin experimenting with Tacotron2.`}</p>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "1000px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "85.6%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAARCAYAAADdRIy+AAAACXBIWXMAAB7CAAAewgFu0HU+AAAC8UlEQVQ4y11UiVbjMAzs/38Xb9mFhZYWKK9X0tyX48Q5mqTHrKTQAmvePJc8ezSSRp4opRDHMXKVo6kb9F2PqqpQFAXapkXbtrLXdS24XC7gxft3XL9NkiSB53nYu3vEaQxdajRNg67rUJryBibjQOfzWS7zfsUPQlZQlAXerXc8Lh4xX73ciN3IhRM6iNIIZVkK4fXy/+v6fWLbNna7HVzHxfJ9if3eEXV930PnWsrBio/Ho3xjRbzzOYfuMHI6d1NoWZYQhmGIxWIO13VwOp0EhlJlQg7Ai0n599AP2Cx3mP55wevzO/JMCxkHm2itkaYpPD8QcLSaFB0Oh8/mlKKQa8hkHEjIaefzZWnQD4MEE4UeKVpvLTyvFe5mLqbbAhvLQxhncGIDK8jhpyMx13ugyxzodRPh7nmN33Mbez9FWWicmVDrHEoXWLkav55XmK9DRIlCrkt4kYbtJYiyktI3opCVsEpWtaNyhdHYsL7vRoVcI7aNzjMkUYCurUkJpUxqmrqSyAMdZlVXhZz+2snwMN9iwQLSgkiLkTAIfCw/PjD/2GPjpNh5CqnSZCWDRBnEqkSmzVhHIuVAx9MR1WEgS6XwkxymZt8eRsKCFERxgr9vLu5fLIEfJsiI1PJSgRsqIWRwJyOfhsHyP0FDsXHR1u3ow1xlCMIYr9sY97MNZqtAGpJRt50wh00FDySlsY6swhQV9pYDa0se3thISClbSQjrppbDMX3cri14TiAdraua5psslWRURyM24hpeR0+M/Pn3fVom5/OJDnfYrnw8Pbxh+WZTgIq6NiCJqRyRooZVcoG7y5dSmv84DIhhnOPL+euBmByHHi29MG7eY7YOsIlqmIZGbzhBVR1iIjPtIMq4w7wMlcN/mkK5HlmOGqj1FyFH7SgdQxG15wpqGjl+sipSaihdTp/9x+DFFppN57B2Nk2Wkv9vKUsq7C3fg//2CuPsMZAP25ZqVhNIbdscfrwy3dAh1CSg0hL4+/v4D6XLDgN1/+3QAAAAAElFTkSuQmCC')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/79970935c0194dc08609144640111d32/da8b6/blog-tts-gradual-training-with-tacotron-for-faster-convergence-architecture.png",
          "srcSet": ["/static/79970935c0194dc08609144640111d32/43fa5/blog-tts-gradual-training-with-tacotron-for-faster-convergence-architecture.png 250w", "/static/79970935c0194dc08609144640111d32/c6e3d/blog-tts-gradual-training-with-tacotron-for-faster-convergence-architecture.png 500w", "/static/79970935c0194dc08609144640111d32/da8b6/blog-tts-gradual-training-with-tacotron-for-faster-convergence-architecture.png 1000w", "/static/79970935c0194dc08609144640111d32/9aaa6/blog-tts-gradual-training-with-tacotron-for-faster-convergence-architecture.png 1447w"],
          "sizes": "(max-width: 1000px) 100vw, 1000px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`Here is the trick. Tacotron has a parameter called “r” which defines the number
of spectrogram frames predicted per decoder iteration. It is a useful parameter
to reduce the number of computations since the larger r, the fewer the decoder
iterations. But setting the value too high might reduce the performance as well.
Another benefit of a higher r value is that the alignment module stabilizes much
faster. If you talk to someone who has used Tacotron, they probably know what a
struggle the attention can be. So finding the right trade-off for r is a big
deal. In the original Tacotron paper, the authors used r=2 for the best-reported
model. They also emphasize the challenge of training the model with r=1.`}</p>
    <p>{`Gradual training comes to the rescue at this point. What it means is that we
set r initially large, such as 7. Then, as the training continues, we reduce it
until convergence. This simple trick helps quite magically to solve two
main problems. First, it helps the network to learn the monotonic attention
after almost the first epoch. Second, it speeds up convergence considerably.
As a result, the final model happens to have more stable and resilient
attention without any degradation of performance. You can even eventually let
the network train with r=1, which was not even reported in the original
paper.`}</p>
    <p>{`Here, I’d like to share some results to demonstrate its effectiveness. I used the
LJSpeech dataset for all the results. The training schedule can be summarized as
follows. (You see I also change the batch_size but it is not necessary if you
have enough GPU memory.)`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`{
"gradual_training": [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]] # [start_step, r, batch_size]
}
`}</code></pre>
    <p>{`Below you can see the attention at validation time after just 1K iterations
with the training schedule above.`}</p>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "1000px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "62.4%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAMCAIAAADtbgqsAAAACXBIWXMAAAsTAAALEwEAmpwYAAACQ0lEQVQoz2WRzWsTQRjG32RTQRGiNsmhCAYP4smDFQs9bHaz2U0ETTebnU1iIWbTSKxYLNgeFP0fVCgoCIKIJz1ZUDHNpUYp5mPToIlNm8R+Jp4KKkTJhzNxW1GHh4dl5/m97zszkJnL0+BlQGQMInHw9SSyBpGGkbNHZNYz7rCET7PXhwavuodubtaza9u3u91up9uCTELjDH6eQn/JhFyguIUr8fFRtznotE84T1xjBsa8w7cqGx/mG496cBtyyUUnSC6D7DLKxLEomcedj11kfJPcHsSBnzcFeSNiwSeb1Y9rpWfrsxhuYzg7l9dhwiPOpLjARx+NM+FpoS/AAynKm6P83lEGZGSNabXle58SBO50YOFlGtfWGxokASSPJUiHpxmryuOiFJ5fFg5Ehf0XnEZFscQyK5U7+TcYbmE4NbuASZ7y4xBrv3xSvOEcmzwzEHIRcucgJKAwIKFDanapejeV0junExoLkgPQOYfqiU8NuqcchyOENKJd0kUp+Mz4FVC/mitWZ16/1eFschFHY+cD3kDUvQ/hUfHwfy5v5y502KJqhcqD5+8I3MadX+RFIRIaDrFAEnyfwpuUf1+OQoJJwQMiW7SgVR4+mdfhXKEsnFJpkDij3wlY0v/iQOaMiAZRskYK75cf30/qcKZY9dov+fvDkk2VrKrPEsGOQ35b9Pf37p+Rg+HI8Yliuvx05hWBW21ofm9u1Rqb1bquSr3++UuttFrSllbL62SrsoV9A3u1Ua81fjR/ft3+1u2tX2XoNquU0C9fAAAAAElFTkSuQmCC')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/646c2bcb73bc464ecee4064944d76d75/da8b6/blog-tts-gradual-training-with-tacotron-for-faster-convergence-attention.png",
          "srcSet": ["/static/646c2bcb73bc464ecee4064944d76d75/43fa5/blog-tts-gradual-training-with-tacotron-for-faster-convergence-attention.png 250w", "/static/646c2bcb73bc464ecee4064944d76d75/c6e3d/blog-tts-gradual-training-with-tacotron-for-faster-convergence-attention.png 500w", "/static/646c2bcb73bc464ecee4064944d76d75/da8b6/blog-tts-gradual-training-with-tacotron-for-faster-convergence-attention.png 1000w", "/static/646c2bcb73bc464ecee4064944d76d75/2e9ed/blog-tts-gradual-training-with-tacotron-for-faster-convergence-attention.png 1500w", "/static/646c2bcb73bc464ecee4064944d76d75/0d4f8/blog-tts-gradual-training-with-tacotron-for-faster-convergence-attention.png 1600w"],
          "sizes": "(max-width: 1000px) 100vw, 1000px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`Next, let’s check the model training curve and convergence.`}</p>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "1000px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "24.400000000000002%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAFCAYAAABFA8wzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAA3ElEQVQY00WQUW4DIQxEuf81W3VFSKIlYIwxZGqzVfphITzM85iQ7gm3nwP5zJhrYqpirQVujFtKeHwd6NSw3m/MObdGRIgx4n5EtNZ2X83nFbh30DODc4Guf8HBPATlO0Jq28N06Ed3UE1PdBGMMT4VereGDqzC0HpNm5bCkwwzSiEo8R7md/1LQwaUF2FYifnFgXYGZrYpfUMm21naBe7XRNdHrts4HGwBxKpW2hC191tzHwuCJ+q2djeYr9/sv5o9nraeaw709bxH5wv0ODHlSuS+rbtWKvgs+AXDIoV6kyTZaQAAAABJRU5ErkJggg==')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/187a960e511c11adecc071701305a1cf/da8b6/blog-tts-gradual-training-with-tacotron-for-faster-convergence-graph.png",
          "srcSet": ["/static/187a960e511c11adecc071701305a1cf/43fa5/blog-tts-gradual-training-with-tacotron-for-faster-convergence-graph.png 250w", "/static/187a960e511c11adecc071701305a1cf/c6e3d/blog-tts-gradual-training-with-tacotron-for-faster-convergence-graph.png 500w", "/static/187a960e511c11adecc071701305a1cf/da8b6/blog-tts-gradual-training-with-tacotron-for-faster-convergence-graph.png 1000w", "/static/187a960e511c11adecc071701305a1cf/a6d44/blog-tts-gradual-training-with-tacotron-for-faster-convergence-graph.png 1012w"],
          "sizes": "(max-width: 1000px) 100vw, 1000px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`Here you can see the model jumping from r=7 to r=5 at about 10k
iterations. There is an obvious, easy gain after the jump.`}</p>
    <p>{`You can listen to `}<a parentName="p" {...{
        "href": "https://soundcloud.com/user-565970875/sets/gradual-training-results"
      }}>{`voice
examples`}</a>{`
generated with the final model using the Griffin-Lim vocoder. I’d say the quality of
these examples is quite good to my ear.`}</p>
    <p>{`This is a short post, but if you’d like to replicate the results here, you can
visit our repo `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/TTS"
      }}>{`TTS`}</a>{` and just run the training
with the provided config.json file. Hopefully, the documentation on the repo
will help you in getting started. Otherwise, you can always ask for help
by creating an issue or on our `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/TTS/discussions"
      }}>{`TTS discussion
page`}</a>{`. There are also some other
cool things in the repo that I will also write about in the future. Until next
time!`}</p>


    </MDXLayout>;
}
;
// Flags the exported component as MDX-generated (presumably checked by MDX tooling — confirm).
MDXContent.isMDXComponent = true;
      