import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

import DefaultLayout from "/home/runner/work/coqui-ai.github.io/coqui-ai.github.io/src/templates/BlogTemplate.tsx";
import { graphql } from 'gatsby';
// Page-level GraphQL query, built with Gatsby's `graphql` tag and exported
// from this module. It declares an optional `$fileAbsolutePath` string
// variable and selects everything in `SidebarPageFragment`.
// NOTE(review): `SidebarPageFragment` is not defined in this file — it is
// presumably registered elsewhere in the project; verify before renaming.
export const pageQuery = graphql`
  query($fileAbsolutePath: String) {
    ...SidebarPageFragment
  }
`;
// The blog template imported above is the layout wrapped around this post.
const MDXLayout = DefaultLayout;

// Exported frontmatter object; it is empty for this post.
export const _frontmatter = {};

// Values handed to the layout on every render: the page query and the
// (empty) frontmatter.
const layoutProps = {
  pageQuery: pageQuery,
  _frontmatter: _frontmatter
};
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">



    <p>{`👉 `}<a parentName="p" {...{
        "href": "https://huggingface.co/spaces/coqui/CoquiTTS"
      }}>{`Try out the new African TTS models!`}</a></p>
    <h3 {...{
      "id": "introduction",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#introduction",
        "aria-label": "introduction permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Introduction`}</h3>
    <p>{`Over the course of several months, researchers from Coqui collaborated with a global team of academics, language
activists, and technologists in order to create high quality Text-to-Speech for six African languages. This blog post
covers who this excellent team was, what we did, and how you can use these new voices for yourself. All the synthetic
voices discussed here are available under a Creative Commons BY-SA 4.0 License — a free, open, and commercial friendly
license.`}</p>
    <p>{`There are literally thousands of languages spoken in Africa, and as such this current work is only the tip of the
iceberg. Nevertheless, we hope our work inspires others to create new open, synthetic voices for as many of Africa’s
languages as possible. Coqui’s TTS can be fine-tuned to any new language, even with tiny amounts of data, regardless
of the alphabet or grammar or linguistic attributes. The more data the better, as you will see (and hear) here. Data
is almost always the bottleneck in deep learning, and in this blogpost we’ll discuss how we found raw data that wasn’t
ready for TTS, and massaged it into a place where beautiful, high-fidelity synthetic voices could be built. Once the
data was ready, training the models was a piece of cake.`}</p>
    <p>{`This project wouldn’t have been possible without our collaborators. Specifically, the excellent Masakhane NLP community
is what brought us all together in the first place. We eagerly look forward to more Coqui + Masakhane collaborations
in the future! If you want to see the future of natural language technology (especially for African languages),
`}<a parentName="p" {...{
        "href": "https://www.masakhane.io/"
      }}>{`Masakhane`}</a>{` is the place to be.`}</p>
    <h3 {...{
      "id": "collaborators",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#collaborators",
        "aria-label": "collaborators permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Collaborators`}</h3>
    <p>{`Without further ado, here’s the team of individuals that brought these voices into reality (in alphabetical order):`}</p>
    <ul>
      <li parentName="ul">{`Alp Öktem, Apelete Agbolo, Bernard Opoku, Chris Emezue, Colin Leong, Daniel Whitenack, David Ifeoluwa Adelani,
Edresson Casanova, Elizabeth Salesky, Iroro Orife, Jesujoba Alabi, Jonathan Mukiibi, Josh Meyer, Julian Weber, Perez
Ogayo, Salomey Osei, Salomon Kabongo, Samuel Olanrewaju, Shamsuddeen Muhammad, Victor Akinode`}</li>
    </ul>
    <p>{`Featuring activists and technologists from:`}</p>
    <ul>
      <li parentName="ul">{`CLEAR Global, Col·lectivaT, Ewegbe Akademi, Masakhane, Niger-Volta LTI, SIL International`}</li>
    </ul>
    <p>{`Featuring academic researchers from:`}</p>
    <ul>
      <li parentName="ul">{`Carnegie Mellon University, Johns Hopkins University, Kwame Nkrumah University of Science and Technology, Leibniz
Universität, Makerere University, Saarland University, Technical University of Munich, University of São Paulo`}</li>
    </ul>
    <h3 {...{
      "id": "the-languages",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#the-languages",
        "aria-label": "the languages permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`The Languages`}</h3>
    <p>{`The six new languages added to TTS are:`}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`Language`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`Classification`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`African region`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`Number of Speakers`}</th>
          <th parentName="tr" {...{
            "align": null
          }}></th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Ewe`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Niger-Congo / Kwa`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`West`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`5.5M`}</td>
          <td parentName="tr" {...{
            "align": null
          }}></td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Hausa`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Afro-Asiatic / Chadic`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`West`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`77M`}</td>
          <td parentName="tr" {...{
            "align": null
          }}></td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Lingala`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Niger-Congo / Bantu`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Central`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`40M`}</td>
          <td parentName="tr" {...{
            "align": null
          }}></td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Akuapem Twi`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Niger-Congo / Akan`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`West`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`626k`}</td>
          <td parentName="tr" {...{
            "align": null
          }}></td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Asante Twi`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Niger-Congo / Akan`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`West`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`3.8M`}</td>
          <td parentName="tr" {...{
            "align": null
          }}></td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Yoruba`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Niger-Congo / Volta-Niger`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`West`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`46M`}</td>
          <td parentName="tr" {...{
            "align": null
          }}></td>
        </tr>
      </tbody>
    </table>
    <p>{`Both the “Number of Speakers” and “Classification” columns come from Ethnologue. These six languages are all
`}<a parentName="p" {...{
        "href": "https://en.wikipedia.org/wiki/Tone_(linguistics)"
      }}>{`tonal`}</a>{`, come from two of the largest language families in Africa
(Niger-Congo and Afro-Asiatic), and are spoken primarily in Central and West Africa. Needless to say, there are a lot
of people speaking these languages in a huge geographic area. By releasing these models under an open Creative Commons
license, we hope they will be immediately useful to speakers of these languages.`}</p>
    <h3 {...{
      "id": "the-collaboration-story",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#the-collaboration-story",
        "aria-label": "the collaboration story permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`The Collaboration Story`}</h3>
    <p>{`As with all machine learning projects, data is the starting point. This entire collaboration spawned from a short URL
posted into a chatroom: `}<a parentName="p" {...{
        "href": "https://open.bible"
      }}>{`open.bible`}</a>{`. A researcher from Coqui was hanging out with the folks from
Masakhane in their slack group when someone posted the link saying something like “looks like some cool data!“. In no
time at all, a lively discussion ensued. The data was absolutely beautiful. All the data was explicitly licensed under
CC-BY-SA, made of hours and hours of high-quality recordings from professional voice actors. This was without
exaggeration the highest-quality voice data for speech synthesis Coqui had ever found in the open — for any language.`}</p>
    <p>{`There was only one problem — the original audio files are too long for training TTS models. The audio was saved as
chapters (from the Bible), which were several minutes long each. It’s best to train synthetic voices with audio clips
under 30 seconds long, so we couldn’t use the data out of the box. The intuitively simple task of breaking chapters
into verses is not so simple in practice, and it requires significant compute power. Nevertheless, over a couple months
and more than a couple cups of coffee, we aligned the recordings to the verse-level, and then we extracted only the
best data. The resulting datasets will be released under the same CC-BY-SA 4.0 license, as well as our research paper
detailing how we made it possible. Both the dataset release and publication of our methods are slated for INTERSPEECH 2022.`}</p>
    <h3 {...{
      "id": "use-the-models",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#use-the-models",
        "aria-label": "use the models permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Use the Models`}</h3>
    <p>{`All models discussed here can be used from:`}</p>
    <ol>
      <li parentName="ol">{`Our official `}<a parentName="li" {...{
          "href": "https://huggingface.co/spaces/coqui/CoquiTTS"
        }}>{`Coqui Huggingface space`}</a></li>
      <li parentName="ol">{`Your browser with `}<a parentName="li" {...{
          "href": "https://tts.readthedocs.io/en/latest/inference.html#on-the-demo-server-tts-server"
        }}><inlineCode parentName="a">{`tts-server`}</inlineCode></a></li>
      <li parentName="ol">{`Your command line with `}<a parentName="li" {...{
          "href": "https://tts.readthedocs.io/en/latest/inference.html#on-the-commandline-tts"
        }}><inlineCode parentName="a">{`tts`}</inlineCode></a></li>
    </ol>
    <h3 {...{
      "id": "conclusion",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#conclusion",
        "aria-label": "conclusion permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Conclusion`}</h3>
    <p>{`Keep an eye our for our INTERSPEECH paper for all the technical details on how we created the datasets and trained the
models. Until then, take the models and do something great! We want to thank again all the folks who helped make this
possible, especially the Masakhane community for bringing us all together. We want to also acknowledge the individuals
who spent hours and hours narrarating the Bible in these languages for the Open.Bible project, and for releasing the
recordings under a Creative Commons license in the first place. We also want to thank the team of folks at Biblica for
taking such care to record, organize, and release the raw data. They did not participate in this research, but it would
not have been possible without them. On a last, but important note, anyone using these synthetic voices should be using
them to create more good in the world, and no more harm. Out of respect to the original voice actors and the nature of
the original recordings, we want the voices to be used to help people, and we’re sure they can find great use in places
like education and accessibility. For example, these voices can easily be used to create audiobooks for people who can’t
see or read well, and make reading more fun for students. If you use these models, let us know what great things you’re
creating in the world!`}</p>


    </MDXLayout>;
}
;
MDXContent.isMDXComponent = true;
      