import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

import DefaultLayout from "/home/runner/work/coqui-ai.github.io/coqui-ai.github.io/src/templates/NewsletterTemplate.tsx";
import { graphql } from 'gatsby';
// GraphQL page query exported alongside the page component.
// It declares a `$fileAbsolutePath` variable and spreads `SidebarPageFragment`,
// which is defined outside this file. The variable is not referenced in the
// query text visible here; presumably it is consumed inside the fragment —
// TODO confirm against the fragment definition.
export const pageQuery = graphql`
  query($fileAbsolutePath: String) {
    ...SidebarPageFragment
  }
`;
// Frontmatter exported for this document; it is an empty object here.
export const _frontmatter = {};

// Layout component that wraps the rendered content.
const MDXLayout = DefaultLayout;

// Props spread onto the layout before the caller-supplied props.
const layoutProps = {
  pageQuery: pageQuery,
  _frontmatter: _frontmatter
};
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">



    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "934px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "27.599999999999998%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAGCAYAAADDl76dAAAACXBIWXMAAA3XAAAN1wFCKJt4AAABCElEQVQY042RQUsCcRDF/VB9g+59gk6eOhTRpaBbh5CgYCuiTkYGah6CSIwiyiRbiiyVFg+5EoS6LO1/XS0oXffXriC1atCDOcyDeW/mTQAXjuMwhFHcH/g9H/CaPqG/W5T0V1ShUTbqjDIb7L/abR/vE8y8KEjyMfv5NHuFS87kO0LhCBvxBOuxBPHTc0K7EVJZGV2YhI+SZAvF3qzd7f4I9h287WpNgfho8WzWWdjaZnFzh+DSMtMra0zMzTO7KjE2GewJj0/NcOKKe+jY9vCGWsvkvlom55ZqamhvBqnrG26fFDIPeYTVRIoecHiR5ir3iKJWMBqW/+TBbDynz46by/9/4sv1G2shujMHTg19AAAAAElFTkSuQmCC')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/c9103bc33c8add5ab1fa4fa1c49c90ef/ca463/logo-wordmark.png",
          "srcSet": ["/static/c9103bc33c8add5ab1fa4fa1c49c90ef/43fa5/logo-wordmark.png 250w", "/static/c9103bc33c8add5ab1fa4fa1c49c90ef/c6e3d/logo-wordmark.png 500w", "/static/c9103bc33c8add5ab1fa4fa1c49c90ef/ca463/logo-wordmark.png 934w"],
          "sizes": "(max-width: 934px) 100vw, 934px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <h3 {...{
      "id": "welcome",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#welcome",
        "aria-label": "welcome permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Welcome`}</h3>
    <p>{`By `}<a parentName="p" {...{
        "href": "https://github.com/kdavis-coqui"
      }}>{`Kelly Davis`}</a></p>
    <p>{`Years ago now, I remember starting work on the speech recognition engine that would become the core of
`}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/STT"
      }}>{`Coqui 🐸 STT`}</a>{`. Then it was only a dream, creating open source speech
technology that brought research into the hands of the enterprise and regular developers. But here we
are, years later, and Coqui is a reality.`}</p>
    <p>{`We’ve grown a lot since then. `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/STT"
      }}>{`Coqui 🐸 STT`}</a>{` has gone from aspiration
to actuality, powering the enterprise and providing speech technology to numerous low-resource languages.
Also, `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/TTS"
      }}>{`Coqui 🐸 TTS`}</a>{`, through that same dream, was born, bringing open,
`}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/TTS#-tts-performance"
      }}>{`human-quality`}</a>{` speech synthesis to regular developers
and the enterprise.`}</p>
    <p>{`I am grateful that you are joining us on this journey, and I want to personally thank everyone who has ever subscribed
to our newsletter, used our software, downloaded a model, filed a bug report, joined our
`}<a parentName="p" {...{
        "href": "https://gitter.im/coqui-ai"
      }}>{`discussion forums`}</a>{`, or simply given us a ⭐ star on GitHub. Thank you!`}</p>
    <p>{`Stay tuned; there’s much more to come! 🐸`}</p>
    <p>{`On to our first monthly newsletter, where you’ll hear what we are up to, along with general reflections on speech tech!`}</p>
    <h3 {...{
      "id": "coqui-stt-playbook",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#coqui-stt-playbook",
        "aria-label": "coqui stt playbook permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Coqui STT Playbook`}</h3>
    <p>{`By `}<a parentName="p" {...{
        "href": "https://github.com/reuben"
      }}>{`Reuben Morais`}</a></p>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "1000px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "61.6%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAMCAIAAADtbgqsAAAACXBIWXMAAAsTAAALEwEAmpwYAAACt0lEQVQozw3SSVPaYACA4fygHnpsp1VLW6oioLIoWyAYSNgCX8AEvrCqkcgmYGQHCagglK2ijradTg/9YfX6Ht95EK3qk1Yt06tlWr0qmj6NpiAVZ0IpmBVLJ8KJcHFekq6EJJ0Ti8l8LhSmcQ9h2V2zqt9plR+RvW2Zzaj0WLcJ536hWr6sFqu9dutGKjcr9kPSEweNQVeajMtpNpsKFk5hLOwDhMGDKnGTEnGg2wHSGHDowyzVHQ2kQb/RFGvtWu4yH+SZEM9k6oXvj8t2NZMV4nySiUWDEQqjCUPIqUOcFjXP+Y9YFweB1BFnD4v58+NoMribDwez4fhhMb6fjn5MbxdTsXcl9RoXlWLYb48E7IA0Ib4DXcRvqwgQ0LgjiFV71UKrxOWSvCjEiqmq1Fw8L7uzu+7k7rLbvJmOa1I7cUgkGRdwWxDaZY4H8WLCTwcdvpTPHTvAoeOoJJCsg4B4RGDrt52LXmO4nBZq5c7wui51jhkiDjDGqUE4jyFMaE9DdobxNvotQcw0Rv2T8vG+d9cZsXN52Ok3pEF3/vJY79SG9/OG1C5AIh3CWHIPifnRgH2Ho6zRKB3OHqGUhggbUbCLApXWtQGOwOz5fvn75de/v9eT0fhp2epfHR/icfrg9TGCW9Q2gwIGsAjAYwnayWBmn0JDrO845ArbZxdHPP35eXM/ZcWzeCY6fny4aFUjfixGWXxWFfKKxKD5mmDdfkAGAFbp1/qLG3gCCIjtEAq9Rx8vJH0plyVkPKucT16ez8SSj7QAL3Zg0yMW3bpJK2fdJpNx643srVy/wvEwW8xDHprc+190coVZricUXs4TTfMwzVOHlNuqCnnRWMSPJAAWpdA8JBhyb3dzxapbd6Mqt1WN7m/Y9jZe8akUq+qtNaVCtvntw9a39xrl6mv0kmaX0/wfDtFUUQbKgn0AAAAASUVORK5CYII=')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/13ebb059798b7e17243411a28f2fb044/da8b6/coqui-stt-playbook.png",
          "srcSet": ["/static/13ebb059798b7e17243411a28f2fb044/43fa5/coqui-stt-playbook.png 250w", "/static/13ebb059798b7e17243411a28f2fb044/c6e3d/coqui-stt-playbook.png 500w", "/static/13ebb059798b7e17243411a28f2fb044/da8b6/coqui-stt-playbook.png 1000w", "/static/13ebb059798b7e17243411a28f2fb044/2e9ed/coqui-stt-playbook.png 1500w", "/static/13ebb059798b7e17243411a28f2fb044/9fabd/coqui-stt-playbook.png 2000w"],
          "sizes": "(max-width: 1000px) 100vw, 1000px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`Getting started with speech-to-text can be somewhat intimidating. There are new terms to learn, tools and
pipelines to get acquainted with, and a whole lot of excitement about the speech experiences you can build.
At Coqui, we want developers to be able to focus entirely on that last part: let their creativity run wild
so they can build applications, systems, and experiences using speech that will redefine how users interact
with technology in the future.`}</p>
    <p>{`With that in mind, Coqui co-founder `}<a parentName="p" {...{
        "href": "https://mobile.twitter.com/_josh_meyer_"
      }}>{`Josh Meyer`}</a>{` started a “playbook”:
a complement to our technical documentation that told a cohesive story from start to end, covering the entire
process of getting familiar with Coqui 🐸 STT (and STT in general), defining all the pieces of the puzzle and
how they fit together, and then walking you through the process of collecting and refining your data,
training models and getting confident in how they work, and finally deploying your model on your platform
and in your programming language of choice.`}</p>
    <p>{`This idea resonated with the community, and things really kicked into high gear when `}<a parentName="p" {...{
        "href": "https://twitter.com/KathyReid"
      }}>{`Kathy Reid`}</a>{`
joined forces in the creation of the playbook. The result is an opinionated guide that is meant as a smooth
onboarding process for getting familiar with STT, getting developers from zero to a working speech-to-text
system, and giving them the knowledge and confidence to start tweaking their processes and finding the
best tailored way to make speech work for them.`}</p>
    <p>{`At Coqui, we want to take this idea to the next level: we want developers to be able to build speech experiences
effortlessly, as well as to easily share their work with the speech community - be it tools, models, new
architectures, or datasets. As a first step, we have released the updated `}<a parentName="p" {...{
        "href": "https://stt.readthedocs.io/en/latest/playbook/README.html"
      }}>{`STT playbook`}</a>{`,
readable now on our main documentation site. We will continue to refine our documentation, tools, and models
working together with our brilliant community to bring speech research into reality.`}</p>
    <p>{`Don’t forget to check out the updated `}<a parentName="p" {...{
        "href": "https://stt.readthedocs.io/en/latest/playbook/README.html"
      }}>{`STT playbook`}</a>{`
and `}<a parentName="p" {...{
        "href": "https://gitter.im/coqui-ai"
      }}>{`join our growing speech community`}</a>{`.`}</p>
    <h3 {...{
      "id": "sc-glowtts-an-efficient-zero-shot-multi-speaker-tts-model",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#sc-glowtts-an-efficient-zero-shot-multi-speaker-tts-model",
        "aria-label": "sc glowtts an efficient zero shot multi speaker tts model permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`SC-GlowTTS: An Efficient Zero-Shot Multi-Speaker TTS Model`}</h3>
    <p>{`By `}<a parentName="p" {...{
        "href": "https://github.com/erogol"
      }}>{`Eren Gölge`}</a></p>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "743px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "60.4%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAMCAYAAABiDJ37AAAACXBIWXMAAAsTAAALEwEAmpwYAAABgUlEQVQoz5WSwW7TQBRF+xks+RE+gj2/wIIN/QV2LFBXVKoEG8QWCYRS1KobiFqqNlIINCq0ceoQO47t2jPjsWd8eIpYVakUz/rqzLvnvS3WvLZt2eSty23dF2qcJ44ixr9GBJMJVWXpfTvn9ft99t5+YhYGa6H3Ao0A8smAP997FLMxjfc8ebHLg8fbPHz0lIODk1XOyccbAa21RJenXA+OyIORjFzzrnfEs1c7PH+5w8XwapXz3m8+Yfz7jPHJPmnwkzpXZHnE5+OPHA6+oGvVrbKuHDc3Uy5GQ5I0l26ecHrFYb/P19MztDLdgFnZkOYJTZGQ3SpUqUmXAfN4RJZKXa1ouwBL06CNwVVGIAlFqWTqhnhZiA4nTswKyMZbto75ImUazim0pRZGlpfM5tFKQV10ctjKHbYyYU28WFKKL20sy6wgDP+iBOiU7rgUHYmzH1yHQz7033B+eSy15ZSiBaU4bRrXDehcja3El1UktxGlLrC1R5kKI/39/9xd4D+GIpv/qdfXDwAAAABJRU5ErkJggg==')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/3a26d3b6eb8dcc0c81662bc544ecc10f/b217e/sc-glowtts.png",
          "srcSet": ["/static/3a26d3b6eb8dcc0c81662bc544ecc10f/43fa5/sc-glowtts.png 250w", "/static/3a26d3b6eb8dcc0c81662bc544ecc10f/c6e3d/sc-glowtts.png 500w", "/static/3a26d3b6eb8dcc0c81662bc544ecc10f/b217e/sc-glowtts.png 743w"],
          "sizes": "(max-width: 743px) 100vw, 743px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`At Coqui, we’re motivated to provide speech technology for all languages and people. One of the problems we’ve
encountered along the way is data-hungry machine learning models. For some languages the data simply isn’t there!
Finding enough data is hard, and even if it’s available, training machine learning models is difficult too.`}</p>
    <p>{`To solve a part of this problem for text-to-speech tasks, we investigated different `}<a parentName="p" {...{
        "href": "https://en.wikipedia.org/wiki/Zero-shot_learning"
      }}>{`zero-shot learning`}</a>{`
approaches using state-of-the-art text-to-speech models. (Zero-shot learning techniques can greatly reduce
data requirements for some algorithms.) This investigation bore fruit! We came up with a new algorithm we
christened “SC-GlowTTS”, a catchy name, I know. SC-GlowTTS can generalize to novel speakers after training
with only 11 speakers for the target language. This means we need less data!`}</p>
    <p>{`Soon after this newsletter finds its way into your hands, we’ll release SC-GlowTTS’s code, models, and an
associated article. Please stay tuned! But, if you can’t wait, check out the SC-GlowTTS
`}<a parentName="p" {...{
        "href": "https://edresson.github.io/SC-GlowTTS/"
      }}>{`project page`}</a>{`.`}</p>
    <p>{`This is the work of the whole Coqui 🐸 TTS community, but special thanks go to the main author
`}<a parentName="p" {...{
        "href": "https://github.com/Edresson"
      }}>{`Edresson Casanova`}</a>{`, who organized and did the bulk of the work, and
`}<a parentName="p" {...{
        "href": "https://github.com/mueller91"
      }}>{`Nicolas Michael Müller`}</a>{` for training the most expansive open
source speaker encoder, which we used in this work.`}</p>
    <p>{`It is great to see how an open community of great developers and researchers can innovate without borders.
We hope this is just a start for us and our great community to pave the way for open source TTS.`}</p>
    <h3 {...{
      "id": "few-shot-keyword-spotting-in-any-language",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#few-shot-keyword-spotting-in-any-language",
        "aria-label": "few shot keyword spotting in any language permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Few-Shot Keyword Spotting in Any Language`}</h3>
    <p>{`By `}<a parentName="p" {...{
        "href": "https://github.com/JRMeyer"
      }}>{`Josh Meyer`}</a></p>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "980px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "34.800000000000004%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAHCAYAAAAIy204AAAACXBIWXMAAAsTAAALEwEAmpwYAAABkElEQVQozy2RW4/SQBiG+//vvTXGRBM1MXpldl3NaswKrBpZMbstZaGFaQeYAoWeT4/TwTeZZDLPfO93spqmIctKAvsPqx+f8b59Rf616dUz5/Ytdx/e4X2ZIG89EhEbVqQpL1+85+nlBR/dCT/VknV+xKqbmkjFLIZXhDeXzK/HyPvgHJQnjK+eY1/8Qo0jUpFSJ7VhR7XjybM3vLofMYlDRH4gbyusY5IRuC6bwSeCO0UkCqq8pWlByUec6xuSJf/VmdN2HUt3wevRdxSFIcJfkiYJVpzkhJPfrEY2h6ik1VlMu9pwI2Yshw+0uqiqKgiE0Lw1pjNtOJw9mL+1Hs3jYk6e51hZUeENBqj5Vps0TJ2pydRrL1cE44W5l0WBY9sUxbkiz1/hhefRtG2H4zgkpxNWmuXIqcvpcNTLyZBhyEmDXnEUsVspqrom0UlCzeL4vBSlFPvDgbIqydKMzXpt4q1af+6BCAS+7xPKULd3bruf1W6/x/M8XZHHdrul6zrD+uDeVOgxBEGAlNK8/QO3dQwIWWFougAAAABJRU5ErkJggg==')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/2c3d0d168ad5f9315b408b2df295551f/2b72d/few-shot-keyword-spotting-in-any-language.png",
          "srcSet": ["/static/2c3d0d168ad5f9315b408b2df295551f/43fa5/few-shot-keyword-spotting-in-any-language.png 250w", "/static/2c3d0d168ad5f9315b408b2df295551f/c6e3d/few-shot-keyword-spotting-in-any-language.png 500w", "/static/2c3d0d168ad5f9315b408b2df295551f/2b72d/few-shot-keyword-spotting-in-any-language.png 980w"],
          "sizes": "(max-width: 980px) 100vw, 980px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`We have traditionally focused on open vocabulary speech-to-text at Coqui, but now we are expanding to new
horizons.`}</p>
    <p>{`The flexibility of open vocabulary STT is great: you can say literally anything to a
`}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/STT"
      }}>{`Coqui 🐸 STT`}</a>{` model, and it will transcribe it. However, there are many
applications where you don’t need to transcribe every word, you just need to spot a few keywords in the
stream of audio. One common application of this is wake word detection, e.g. “`}<a parentName="p" {...{
        "href": "https://www.youtube.com/watch?v=1ZXugicgn6U"
      }}>{`Hey, Computer`}</a>{`”.
For the task of wake word detection, deploying a large, open vocabulary STT model is impractical on edge devices.
(Edge devices have very small compute resources and assume limited power consumption.) As such, we’ve begun
research into robust, multilingual, practical model architectures for keyword spotting.`}</p>
    <p>{`Collaborating with researchers from `}<a parentName="p" {...{
        "href": "https://edge.seas.harvard.edu/"
      }}>{`Harvard University`}</a>{` and
`}<a parentName="p" {...{
        "href": "https://www.tensorflow.org/lite"
      }}>{`Google`}</a>{`, Coqui co-authored a publication on an
extremely efficient keyword spotting technique that scales to any language. The training technique is
intuitive and it works: we first train a base model to classify lots of keywords in lots of different
languages. This is a classification model, not a transcription model. This base model does really well
for the keywords on which it was trained, but we really care about how well this model can perform on
new keywords that it’s never heard in new languages. The answer: it does great! We can fine-tune this
base model to any new keyword in any language with just five audio clips of the new word.`}</p>
    <p>{`We submitted this work to the `}<a parentName="p" {...{
        "href": "https://www.interspeech2021.org/"
      }}>{`INTERSPEECH 2021`}</a>{` conference. Wish us luck
on acceptance! Until INTERSPEECH 2021, let us whet your appetite with this teaser, the abstract for the
paper:`}</p>
    <p><strong parentName="p">{`Abstract:`}</strong>{`
We introduce a few-shot transfer learning method for key-word spotting in any language. Leveraging open
speech corpora in nine languages, we automate the extraction of a large multilingual keyword bank and
use it to train an embedding model. With just five training examples, we fine-tune the embedding model
for keyword spotting and achieve an average F1 score of 0.75 on keyword classification for 180 new
keywords unseen by the embedding model in these nine languages. This embedding model also generalizes
to new languages. We achieve an average F1 score of 0.65 on 5-shot models for 260 keywords sampled
across 13 new languages unseen by the embedding model. We investigate streaming accuracy for our 5-shot
models in two contexts: keyword spotting and keyword search. Across 440 keywords in 22 languages,
we achieve an average streaming keyword spotting accuracy of 85.2% with a false acceptance rate of
1.2%, and observe promising initial results on keyword search.`}</p>
    <h3 {...{
      "id": "new-release--tts-v0011",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#new-release--tts-v0011",
        "aria-label": "new release  tts v0011 permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`New Release: 🐸 TTS v0.0.11`}</h3>
    <p>{`By `}<a parentName="p" {...{
        "href": "https://github.com/erogol"
      }}>{`Eren Gölge`}</a></p>
    <p>{`Oh, one more thing!`}</p>
    <p>{`We are happy to release two new German 🐸 TTS models trained and shared by the great `}<a parentName="p" {...{
        "href": "https://twitter.com/ThorstenVoice"
      }}>{`ThorstenVoice`}</a>{`.`}</p>
    <p>{`You can see a list of all the released models and start using them with these simple command-line calls:`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-terminal"
      }}>{`> pip install -U tts
> tts --list_models
> tts --text "Coqui TTS is great!" --out_path path/to/save/output.wav
> tts --model_name tts_models/de/thorsten/tacotron2-DCA --text "Coqui TTS ist bereit, Deutsch zu sprechen." --out_path output.wav
`}</code></pre>
    {
      /* markdownlint-enable line-length */
    }


    </MDXLayout>;
}
;
// Attach a static `isMDXComponent` flag to the component function.
// NOTE(review): presumably read by the MDX runtime or layout to recognise
// MDX-generated components — confirm against the consumer.
MDXContent.isMDXComponent = true;
      