import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

import DefaultLayout from "/home/runner/work/coqui-ai.github.io/coqui-ai.github.io/src/templates/BlogTemplate.tsx";
import { graphql } from 'gatsby';
// Page-level GraphQL query, exported under the name `pageQuery`.
// It declares one variable, `$fileAbsolutePath` (String), and selects only the
// `SidebarPageFragment` fragment, which is defined outside this file.
// NOTE(review): the variable is not referenced in the visible query text —
// presumably the fragment consumes it; confirm against the fragment definition.
// The template body is left untouched because its exact text is the query.
export const pageQuery = graphql`
  query($fileAbsolutePath: String) {
    ...SidebarPageFragment
  }
`;
// Exported frontmatter object; it is empty in this file.
export const _frontmatter = {};
// Values handed to the layout on every render: the exported page query and
// the exported frontmatter object. MDXContent spreads these before the
// caller's own props, so caller props with the same key take precedence.
const layoutProps = {
  pageQuery: pageQuery,
  _frontmatter: _frontmatter
};

// Local alias for the imported blog template; this is the wrapper element
// that MDXContent renders around the page body.
const MDXLayout = DefaultLayout;
/**
 * Default export: the page body for the YourTTS blog post.
 *
 * Renders a single `MDXLayout` element (an alias of the imported
 * `DefaultLayout`) whose children are the article content: four call-to-action
 * links, introductory paragraphs, a capability list, five responsive image
 * figures, and `h3` sections (Model Architecture, Dataset, Training, Results,
 * Try out YourTTS, Ethical Concerns, Conclusion), each prefixed with a
 * permalink anchor containing an inline SVG link icon.
 *
 * Props are forwarded to the layout in this order: `layoutProps`, then the
 * remaining caller props, then `components` and `mdxType="MDXLayout"`, so
 * later entries override earlier ones on key collisions.
 *
 * @param components Passed through unchanged to `MDXLayout` as `components`.
 * @param props Any other props; spread onto `MDXLayout` after `layoutProps`.
 * @returns The JSX tree for the whole post.
 *
 * NOTE(review): the `parentName` and `mdxType` props are presumably metadata
 * consumed by the `mdx` JSX factory named in the file-level `@jsx` pragma —
 * confirm against the @mdx-js/react version in use.
 * NOTE(review): every `img` below has `alt` and `title` set to the placeholder
 * text "IMAGE", and the figure wrappers use `<div align="center">`. This file
 * looks like compiler output, so any fix probably belongs in the upstream
 * source document rather than here — confirm before editing by hand.
 */
export default function MDXContent({
  components,
  ...props
}) {
  // Whitespace inside the template literals below is part of the rendered text.
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">



    <p>{`👉 `}<a parentName="p" {...{
        "href": "/"
      }}>{`Try the YourTTS demo`}</a></p>
    <p>{`👉 Visit the YourTTS `}<a parentName="p" {...{
        "href": "https://edresson.github.io/YourTTS/"
      }}>{`project page`}</a></p>
    <p>{`👉 Try YourTTS on `}<a parentName="p" {...{
        "href": "https://colab.research.google.com/drive/1ftI0x16iqKgiQFgTjTDgRpOM1wC1U-yS?usp=sharing"
      }}>{`Colab`}</a></p>
    <p>{`👉 Try voice conversion with YourTTS on `}<a parentName="p" {...{
        "href": "https://colab.research.google.com/drive/1gjdwOKCZuavPn_5oy8QA01sKmXpEq5AZ?usp=sharing"
      }}>{`Colab`}</a></p>
    <p>{`The recent surge of new end-to-end deep learning models has enabled new and
exciting Text-to-Speech (TTS) use-cases with impressive natural-sounding
results. However, most of these models are trained on massive datasets
(20-40 hours) recorded with a single speaker in a professional environment. In
this setting, expanding your solution to multiple languages and speakers is not
feasible for everyone. Moreover, it is particularly tough for low-resource
languages not commonly targeted by mainstream research. To get rid of these
limitations and bring zero-shot TTS to low resource languages, we built
`}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/2112.02418"
      }}>{`YourTTS`}</a>{`, which can synthesize voices in
multiple languages and reduce data requirements significantly by transferring
knowledge among languages in the training set. For instance, we can easily
introduce Brazilian Portuguese to the model with a single speaker dataset by
co-training with a larger English dataset. It makes the model speak Brazilian
Portuguese with voices from the English dataset, or we can even introduce new
speakers by zero-shot learning on the fly.`}</p>
    <p>{`In “`}<strong parentName="p">{`YourTTS`}</strong>{`: Towards Zero-Shot Multi-Speaker TTS and Zero-Shot Voice
Conversion for everyone” we introduce a model with the following capabilities:`}</p>
    <ul>
      <li parentName="ul"><strong parentName="li">{`Multi-Lingual TTS`}</strong>{`: Synthesizing speech in multiple languages with a single
model.`}</li>
      <li parentName="ul"><strong parentName="li">{`Multi-Speaker TTS`}</strong>{`: Synthesizing speech with different voices with a single
model.`}</li>
      <li parentName="ul"><strong parentName="li">{`Zero-Shot learning`}</strong>{`: Adapting the model to synthesize the speech of a novel
speaker without re-training the model.`}</li>
      <li parentName="ul"><strong parentName="li">{`Speaker/language adaptation`}</strong>{`: Fine-tuning a pre-trained model to learn a
new speaker or language. (Learn Turkish from a relatively smaller dataset
by transferring knowledge from learned languages)`}</li>
      <li parentName="ul"><strong parentName="li">{`Cross-language voice transfer`}</strong>{`: Transferring a voice from its original
language to a different language. (Using the voice of an English speaker in
French)`}</li>
      <li parentName="ul"><strong parentName="li">{`Zero-shot voice conversion`}</strong>{`: Changing the voice of a given speech clip.`}</li>
    </ul>
    <div align="center">
      <p><span parentName="p" {...{
          "className": "gatsby-resp-image-wrapper",
          "style": {
            "position": "relative",
            "display": "block",
            "marginLeft": "auto",
            "marginRight": "auto",
            "maxWidth": "1000px"
          }
        }}>{`
      `}<span parentName="span" {...{
            "className": "gatsby-resp-image-background-image",
            "style": {
              "paddingBottom": "91.19999999999999%",
              "position": "relative",
              "bottom": "0",
              "left": "0",
              "display": "block"
            }
          }}></span>{`
  `}<img parentName="span" {...{
            "className": "gatsby-resp-image-image",
            "alt": "IMAGE",
            "title": "IMAGE",
            "src": "/static/b89c253536048e68d39ccf79d68bea74/da8b6/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-modes.png",
            "srcSet": ["/static/b89c253536048e68d39ccf79d68bea74/43fa5/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-modes.png 250w", "/static/b89c253536048e68d39ccf79d68bea74/c6e3d/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-modes.png 500w", "/static/b89c253536048e68d39ccf79d68bea74/da8b6/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-modes.png 1000w", "/static/b89c253536048e68d39ccf79d68bea74/a2880/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-modes.png 1312w"],
            "sizes": "(max-width: 1000px) 100vw, 1000px",
            "style": {
              "width": "100%",
              "height": "100%",
              "margin": "0",
              "verticalAlign": "middle",
              "position": "absolute",
              "top": "0",
              "left": "0"
            },
            "loading": "lazy"
          }}></img>{`
    `}</span></p>
    </div>
    <h3 {...{
      "id": "model-architecture",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#model-architecture",
        "aria-label": "model architecture permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Model Architecture`}</h3>
    <p>{`YourTTS is an extension of our previous work `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/2104.05557"
      }}>{`SC-GlowTTS`}</a>{`.
It uses the `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/2106.06103"
      }}>{`VITS`}</a>{` (Variational Inference with
adversarial learning for end-to-end Text-to-Speech) model as the backbone architecture and
builds on top of it. We use a larger text encoder than the original model.
Also, YourTTS employs a separately trained speaker encoder model to compute the
speaker embedding vectors (d-vectors) to pass speaker information to the rest
of the model. We use the `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/2009.14153"
      }}>{`H/ASP`}</a>{` model as the
speaker encoder architecture. See the figure below for the overall model
architecture in training (right) and inference (left).`}</p>
    <div align="center">
      <p><span parentName="p" {...{
          "className": "gatsby-resp-image-wrapper",
          "style": {
            "position": "relative",
            "display": "block",
            "marginLeft": "auto",
            "marginRight": "auto",
            "maxWidth": "1000px"
          }
        }}>{`
      `}<span parentName="span" {...{
            "className": "gatsby-resp-image-background-image",
            "style": {
              "paddingBottom": "62.4%",
              "position": "relative",
              "bottom": "0",
              "left": "0",
              "display": "block"
            }
          }}></span>{`
  `}<img parentName="span" {...{
            "className": "gatsby-resp-image-image",
            "alt": "IMAGE",
            "title": "IMAGE",
            "src": "/static/aeec1cf73c251540bf76af923d1737af/da8b6/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-inference.png",
            "srcSet": ["/static/aeec1cf73c251540bf76af923d1737af/43fa5/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-inference.png 250w", "/static/aeec1cf73c251540bf76af923d1737af/c6e3d/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-inference.png 500w", "/static/aeec1cf73c251540bf76af923d1737af/da8b6/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-inference.png 1000w", "/static/aeec1cf73c251540bf76af923d1737af/2e9ed/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-inference.png 1500w", "/static/aeec1cf73c251540bf76af923d1737af/39a40/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-inference.png 1874w"],
            "sizes": "(max-width: 1000px) 100vw, 1000px",
            "style": {
              "width": "100%",
              "height": "100%",
              "margin": "0",
              "verticalAlign": "middle",
              "position": "absolute",
              "top": "0",
              "left": "0"
            },
            "loading": "lazy"
          }}></img>{`
    `}</span></p>
    </div>
    <p>{`VITS is a peculiar TTS model as it employs different deep-learning techniques
together (adversarial learning, normalizing flows, variational auto-encoders,
transformers) to achieve high-quality natural-sounding output. It is mainly
built on the `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/2005.11129"
      }}>{`GlowTTS`}</a>{` model. The GlowTTS
is light, robust to long sentences, converges rapidly, and is backed up by
theory since it directly maximizes the log-likelihood of speech with the
alignment. However, its biggest weakness is the lack of naturalness and
expressivity of the output.`}</p>
    <p>{`VITS improves on it by introducing specific updates. First, it replaces the
duration predictor with a stochastic duration predictor that better models the
variability in speech. Then, it connects a HifiGAN vocoder to the decoder’s
output and joins the two with a variational autoencoder (VAE). That allows the
model to train in an end2end fashion and find a better intermediate
representation than traditionally used mel-spectrograms. This results in high
fidelity and more precise prosody, achieving better MOS values reported in the
paper.`}</p>
    <p>{`Note that both GlowTTS and VITS implementations are available on 🐸TTS.`}</p>
    <h3 {...{
      "id": "dataset",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#dataset",
        "aria-label": "dataset permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Dataset`}</h3>
    <p>{`We combined multiple datasets for different languages. We used `}<a parentName="p" {...{
        "href": "https://datashare.ed.ac.uk/handle/10283/2651"
      }}>{`VCTK`}</a>{`
and `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/1904.02882"
      }}>{`LibriTTS`}</a>{` for English (multispeaker datasets),
`}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/2005.05144"
      }}>{`TTS-Portuguese Corpus`}</a>{` (TPC) for Brazilian
Portuguese, and the French subset of the `}<a parentName="p" {...{
        "href": "https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/"
      }}>{`M-AILABS`}</a>{`
dataset (FMAI).`}</p>
    <p>{`We resample the audio clips to 16 kHz, apply voice activity detection to remove
silences and apply `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/TTS/blob/aa2450e8f28b4367791a7988cf87e395e501ce67/TTS/utils/audio.py#L787"
      }}>{`RMS volume normalization`}</a>{`
before passing them to the speaker encoder.`}</p>
    <h3 {...{
      "id": "training",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#training",
        "aria-label": "training permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Training`}</h3>
    <p>{`We train YourTTS incrementally, starting from a single speaker English dataset
and adding more speakers and languages along the way. We start from a
pre-trained model on the LJSpeech dataset for 1M steps and continue with the
VCTK dataset for 200K steps. Next, we randomly initialize the new layers
introduced by the YourTTS model on the VITS model. Then we add the other
datasets one by one and train for `}{`~`}{`120K steps with each new dataset.`}</p>
    <p>{`Before we report results on each dataset, we also fine-tune the final model with
speaker encoder loss (SCL) on that particular dataset. SCL compares output
speech embeddings with the ground truth embeddings computed by the speaker
encoder with cosine similarity loss.`}</p>
    <p>{`We used a single V100 GPU and used a batch size of 64. We used the AdamW
optimizer with beta values 0.8 and 0.99 and a learning rate of 0.0002 decaying
exponentially with gamma 0.999875 per iteration. We also employed a weight
decay of 0.01.`}</p>
    <h3 {...{
      "id": "results",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#results",
        "aria-label": "results permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Results`}</h3>
    <p>{`We compute “mean opinion score” (MOS) tests and similarity MOS tests to evaluate the model
performance. We also use `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/2104.05557"
      }}>{`speaker encoder cosine similarity (SECS)`}</a>{` to
measure the similarity between the predicted outputs and the actual audio clips
of a target speaker. We used a 3rd party library for SECS to be compatible with
the previous work. We avoid details of our experiments for the sake of brevity.
Please refer to the paper to see the details.`}</p>
    <div align="center">
      <p><span parentName="p" {...{
          "className": "gatsby-resp-image-wrapper",
          "style": {
            "position": "relative",
            "display": "block",
            "marginLeft": "auto",
            "marginRight": "auto",
            "maxWidth": "1000px"
          }
        }}>{`
      `}<span parentName="span" {...{
            "className": "gatsby-resp-image-background-image",
            "style": {
              "paddingBottom": "27.200000000000003%",
              "position": "relative",
              "bottom": "0",
              "left": "0",
              "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAFCAYAAABFA8wzAAAACXBIWXMAABYlAAAWJQFJUiTwAAAA1UlEQVQY0z1QR6rFMBDL/c8V3iKN9N57gWSRA+gjQf7CyIzVPEZd11iWBcdxIMsyoeu6mgVBgKqqUJYlhmGAZVkoigK+7yNNUzRNgyiKEIahtOQbJP1+PxE9z5PYtm0wiG9JkuA8TwVt2yb8Cuz7jvu+8TyPONd1waCASeu6yoB30zTViMYMattWAWzOxmxIAxp9hsT3fWHkeS4xE1n5M5rnWcj3rusU9K2AyBDO+XVy4jhWqMG/s8U4jtpD3/cSTNP0vwLOSeauKHYcR8jDHZJPY3L/APdXZAuNFp69AAAAAElFTkSuQmCC')",
              "backgroundSize": "cover",
              "display": "block"
            }
          }}></span>{`
  `}<img parentName="span" {...{
            "className": "gatsby-resp-image-image",
            "alt": "IMAGE",
            "title": "IMAGE",
            "src": "/static/0bdf0c67d1d931092fb99c86800026c8/da8b6/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-1.png",
            "srcSet": ["/static/0bdf0c67d1d931092fb99c86800026c8/43fa5/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-1.png 250w", "/static/0bdf0c67d1d931092fb99c86800026c8/c6e3d/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-1.png 500w", "/static/0bdf0c67d1d931092fb99c86800026c8/da8b6/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-1.png 1000w", "/static/0bdf0c67d1d931092fb99c86800026c8/2e9ed/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-1.png 1500w", "/static/0bdf0c67d1d931092fb99c86800026c8/9fabd/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-1.png 2000w", "/static/0bdf0c67d1d931092fb99c86800026c8/040e1/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-1.png 2458w"],
            "sizes": "(max-width: 1000px) 100vw, 1000px",
            "style": {
              "width": "100%",
              "height": "100%",
              "margin": "0",
              "verticalAlign": "middle",
              "position": "absolute",
              "top": "0",
              "left": "0"
            },
            "loading": "lazy"
          }}></img>{`
    `}</span></p>
    </div>
    <p>{`Table (1) above shows our results on different datasets. Exp1 is trained with
only the VCTK. Exp2. is with the VCTK and TPC. Then, we add the FMAI, LibriTTS
for Exp3. and Exp4, respectively. The ground truth row reports the values for
the real speaker clips in respective datasets. Finally, we compare our results
with `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/2005.08484"
      }}>{`AttentronZS`}</a>{` and `}<a parentName="p" {...{
        "href": "https://arxiv.org/abs/2104.05557"
      }}>{`SC-GlowTTS`}</a>{`.
Note that SC-GlowTTS is our previous work which leads the way to YourTTS. You
can find its implementation under `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/TTS"
      }}>{`🐸TTS`}</a>{`. We
achieve significantly better results than the works compared to in our
experiments. MOS values are on-par or better than the ground truth in some
cases, which is surprising even for us to see.`}</p>
    <div align="center">
      <p><span parentName="p" {...{
          "className": "gatsby-resp-image-wrapper",
          "style": {
            "position": "relative",
            "display": "block",
            "marginLeft": "auto",
            "marginRight": "auto",
            "maxWidth": "1000px"
          }
        }}>{`
      `}<span parentName="span" {...{
            "className": "gatsby-resp-image-background-image",
            "style": {
              "paddingBottom": "11.6%",
              "position": "relative",
              "bottom": "0",
              "left": "0",
              "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAACCAYAAABYBvyLAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAWElEQVQI1z2O1w0AIQxD2X8/aiiCDOLTQ+I+IsclVsJaS3tvmZlaa8o565xzd7QxhsiAKaU/23tXjFHurlrr1fADBLOUojnnLYKDcIoIv6H8edzCeYIc+AEw0o7A7mPYtwAAAABJRU5ErkJggg==')",
              "backgroundSize": "cover",
              "display": "block"
            }
          }}></span>{`
  `}<img parentName="span" {...{
            "className": "gatsby-resp-image-image",
            "alt": "IMAGE",
            "title": "IMAGE",
            "src": "/static/1a86528f25d208b19a99b3914ccc4ca0/da8b6/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-2.png",
            "srcSet": ["/static/1a86528f25d208b19a99b3914ccc4ca0/43fa5/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-2.png 250w", "/static/1a86528f25d208b19a99b3914ccc4ca0/c6e3d/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-2.png 500w", "/static/1a86528f25d208b19a99b3914ccc4ca0/da8b6/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-2.png 1000w", "/static/1a86528f25d208b19a99b3914ccc4ca0/2e9ed/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-2.png 1500w", "/static/1a86528f25d208b19a99b3914ccc4ca0/9fabd/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-2.png 2000w", "/static/1a86528f25d208b19a99b3914ccc4ca0/62145/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-2.png 2456w"],
            "sizes": "(max-width: 1000px) 100vw, 1000px",
            "style": {
              "width": "100%",
              "height": "100%",
              "margin": "0",
              "verticalAlign": "middle",
              "position": "absolute",
              "top": "0",
              "left": "0"
            },
            "loading": "lazy"
          }}></img>{`
    `}</span></p>
    </div>
    <p>{`Table (2) depicts the zero-shot voice conversion (ZSVC) results between
languages and genders by the speaker embeddings. For ZSVC, we pass the given
speech clip from the posterior encoder to compute the hidden representation and
re-run the model in the inference mode again conditioned on the target
speaker’s embedding. You see in the table the model’s performance between
languages and genders. For instance, “en-pt” shows the results for converting
the voice of a Portuguese speaker by conditioning on an English speaker.
And “M-F” offers the conversion of a Male speaker to a Female speaker.`}</p>
    <div align="center">
      <p><span parentName="p" {...{
          "className": "gatsby-resp-image-wrapper",
          "style": {
            "position": "relative",
            "display": "block",
            "marginLeft": "auto",
            "marginRight": "auto",
            "maxWidth": "1000px"
          }
        }}>{`
      `}<span parentName="span" {...{
            "className": "gatsby-resp-image-background-image",
            "style": {
              "paddingBottom": "36%",
              "position": "relative",
              "bottom": "0",
              "left": "0",
              "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAHCAYAAAAIy204AAAACXBIWXMAABYlAAAWJQFJUiTwAAABL0lEQVQoz1WRW6uCUBSE/f9/xycJIsKgJM3shqlZPlgJXbW8pMxhVigdYVjbkT17vq3yeDwQxzHm8zmOxyO22y2WyyVWq5XI8zw4joMgCGQuFgtMp1PsdjvYtg3TNOG6LlRVFV95vV4oigKceZ6jfd7vNy6XC+q67rz9fi/ixs1mg/V63QVqmvYNvF6veD6fuN1uuN/vyLJMRD9JEvnGdx7I9tR4PBYKhrXhvV4PhmFAITKD2smNFNudTicJbD3i+74vgcS1LAuj0Uia9vt96Lr+RSYqN3I2TSPimgcQmaJ3OBwQRZE0Y/hsNpPgMAwxHA7lPwgy23ES+xf5fD4jTdN/yAxiQ97bZDIRcU1kNlbKspQ2bdPP5yON2JiBVVV1Hv90i8w75GQgvcFgIM3/AKoi/q8AUjd8AAAAAElFTkSuQmCC')",
              "backgroundSize": "cover",
              "display": "block"
            }
          }}></span>{`
  `}<img parentName="span" {...{
            "className": "gatsby-resp-image-image",
            "alt": "IMAGE",
            "title": "IMAGE",
            "src": "/static/1cbc7cb409021c47eb620e97b3f6b898/da8b6/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-3.png",
            "srcSet": ["/static/1cbc7cb409021c47eb620e97b3f6b898/43fa5/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-3.png 250w", "/static/1cbc7cb409021c47eb620e97b3f6b898/c6e3d/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-3.png 500w", "/static/1cbc7cb409021c47eb620e97b3f6b898/da8b6/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-3.png 1000w", "/static/1cbc7cb409021c47eb620e97b3f6b898/2e9ed/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-3.png 1500w", "/static/1cbc7cb409021c47eb620e97b3f6b898/30dc5/blog-tts-yourtts-zero-shot-text-synthesis-low-resource-languages-table-3.png 1962w"],
            "sizes": "(max-width: 1000px) 100vw, 1000px",
            "style": {
              "width": "100%",
              "height": "100%",
              "margin": "0",
              "verticalAlign": "middle",
              "position": "absolute",
              "top": "0",
              "left": "0"
            },
            "loading": "lazy"
          }}></img>{`
    `}</span></p>
    </div>
    <p>{`Table (3) yields the results for the speaker adaptation experiments where we
fine-tune the final YourTTS model by SCL on different length clips of a
particular novel speaker. For instance, the top row shows the results for a
model trained on a male English speaker with 61 seconds of an audio clip. GT is
the ground truth, ZS is zero-shot with only the speaker embeddings, and FT is
fine-tuning. These results show that our model can achieve high similarity
when fine-tuned with only 20 seconds of audio sample from a speaker in case
mere use of speaker embeddings is not enough to produce high-quality results.`}</p>
    <p>{`Due to the time and space constraints in the paper, we could not expand the
experiments to all the possible use-cases of YourTTS. We plan to include those
in our future study and add new capabilities to YourTTS that would give more
control over the model.`}</p>
    <h3 {...{
      "id": "try-out-yourtts",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#try-out-yourtts",
        "aria-label": "try out yourtts permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Try out YourTTS`}</h3>
    <p>{`Give YourTTS a try right on your browser using the demo on our `}<a parentName="p" {...{
        "href": "/"
      }}>{`homepage`}</a>{`.`}</p>
    <p>{`YourTTS is also available in `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/TTS"
      }}>{`🐸TTS`}</a>{` with a
training recipe and a pre-trained model. You can train your own model,
synthesize voice with the pre-trained model or finetune it with your dataset.`}</p>
    <h3 {...{
      "id": "ethical-concerns",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#ethical-concerns",
        "aria-label": "ethical concerns permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Ethical Concerns`}</h3>
    <p>{`We are well aware that the expansion of the TTS technology enables various kinds
of malign uses of the technology. Therefore, we also actively study different
approaches to prevent or at the very least put more fences along the way of the
misuse of the TTS technology.`}</p>
    <p>{`To exemplify this, on our demo, we add background music to avert the unintended
use of the voice clips on different platforms.`}</p>
    <p>{`If you also want to contribute to our research and discussion in this field, join
us `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/TTS/discussions/1036"
      }}>{`here`}</a>{`.`}</p>
    <h3 {...{
      "id": "conclusion",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#conclusion",
        "aria-label": "conclusion permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Conclusion`}</h3>
    <p>{`YourTTS can achieve competitive results on multi-lingual, multi-speaker TTS, and
zero-shot learning. It also allows cross-language voice transfer, learning new
speakers and languages from relatively more minor datasets than the traditional
TTS models.`}</p>
    <p>{`We are excited to present YourTTS and see all the different use-cases the 🐸
Community will apply it to. As always, feel free to reach out for any feedback.`}</p>


    </MDXLayout>;
}
;
// Sets a static `isMDXComponent` flag on the component function.
// NOTE(review): presumably read by MDX tooling to recognise compiled MDX
// content — confirm where this flag is consumed.
MDXContent.isMDXComponent = true;
      