import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

import DefaultLayout from "/home/runner/work/coqui-ai.github.io/coqui-ai.github.io/src/templates/BlogTemplate.tsx";
import { graphql } from 'gatsby';
// GraphQL query exported next to the page component. It declares a
// `$fileAbsolutePath` variable and spreads `SidebarPageFragment`, a fragment
// that is not defined in this file.
// NOTE(review): presumably picked up by Gatsby's page-query mechanism, which
// would require this exact exported tagged-template form — confirm before
// restructuring.
export const pageQuery = graphql`
  query($fileAbsolutePath: String) {
    ...SidebarPageFragment
  }
`;
// Frontmatter export for this document; it is an empty object here.
export const _frontmatter = {};
// Props spread onto the layout element by MDXContent, ahead of the caller's
// own props (so caller-supplied props with the same keys take precedence).
const layoutProps = {
  pageQuery,
  _frontmatter
};
// Layout component that wraps the rendered content; alias of the imported
// BlogTemplate default export.
const MDXLayout = DefaultLayout;
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">



    <p>{`The Machine Learning team at Coqui continues to work on an automatic speech
recognition engine as part of `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/STT"
      }}>{`Coqui STT`}</a>{`,
which aims to make speech technologies and trained models openly available to
developers. We’re hard at work improving performance and ease-of-use for our
open source speech-to-text engine. The upcoming 0.2 release will include a
much-requested feature: the ability to do speech recognition live, as the audio
is being recorded. This blog post describes how we changed the STT engine’s
architecture to allow for this, achieving real-time transcription performance.
Soon, you’ll be able to transcribe audio at least as fast as it’s coming in.`}</p>
    <p>{`When applying neural networks to sequential data like audio or text, it’s
important to capture patterns that emerge over time. Recurrent neural networks
(RNNs) are neural networks that “remember” — they take as input not just the
next element in the data, but also a state that evolves over time, and use this
state to capture time-dependent patterns. Sometimes, you may want to capture
patterns that depend on future data as well. One of the ways to solve this is
by using two RNNs, one that goes forward in time and one that goes backward,
starting from the last element in the data and going to the first element. You
can learn more about RNNs (and about the specific type of RNN used in Coqui
STT) in `}<a parentName="p" {...{
        "href": "https://colah.github.io/posts/2015-08-Understanding-LSTMs/"
      }}>{`this article by Chris
Olah`}</a>{`.`}</p>
    <h3 {...{
      "id": "using-a-bidirectional-rnn",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#using-a-bidirectional-rnn",
        "aria-label": "using a bidirectional rnn permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Using a bidirectional RNN`}</h3>
    <p>{`The current release of Coqui STT (`}<a parentName="p" {...{
        "href": "/blog/stt/a-journey-to-10-word-error-rate"
      }}>{`previously
covered`}</a>{`) uses a bidirectional RNN
implemented with `}<a parentName="p" {...{
        "href": "https://www.tensorflow.org/"
      }}>{`TensorFlow`}</a>{`, which means it
needs to have the entire input available before it can begin to do any useful
work. One way to improve this situation is by implementing a streaming model:
Do the work in chunks, as the data is arriving, so when the end of the input is
reached, the model is already working on it and can give you results more
quickly. You could also try to look at partial results midway through the
input.`}</p>
    <p><img parentName="p" {...{
        "src": "/static/blog-stt-a-journey-to-10-word-error-rate-architecture-1dd53cecf9b0f6023f3d4ed8eac5fa25.gif",
        "alt": "IMAGE"
      }}></img></p>
    <p>{`This animation shows how the data flows through the network. Data flows from
the audio input to feature computation, through three fully connected layers.
Then it goes through a bidirectional RNN layer, and finally through a final
fully connected layer, where a prediction is made for a single time step.`}</p>
    <p>{`In order to do this, you need to have a model that lets you do the work in
chunks. The animation above is a diagram of the current model, showing how data
flows through it.`}</p>
    <p>{`As you can see, on the bidirectional RNN layer, the data for the very last step
is required for the computation of the second-to-last step, which is required
for the computation of the third-to-last step, and so on. These are the red
arrows in the diagram that go from right to left.`}</p>
    <p>{`We could implement partial streaming in this model by doing the computation up
to layer three as the data is fed in. The problem with this approach is that it
wouldn’t gain us much in terms of latency: Layers four and five are responsible
for almost half of the computational cost of the model.`}</p>
    <h3 {...{
      "id": "using-a-unidirectional-rnn-for-streaming",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#using-a-unidirectional-rnn-for-streaming",
        "aria-label": "using a unidirectional rnn for streaming permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Using a unidirectional RNN for streaming`}</h3>
    <p>{`Instead, we can replace the bidirectional layer with a unidirectional layer,
which does not have a dependency on future time steps. That lets us do the
computation all the way to the final layer as soon as we have enough audio
input.`}</p>
    <p>{`With a unidirectional model, instead of feeding the entire input in at once and
getting the entire output, you can feed the input piecewise. Meaning, you can
input 100ms of audio at a time, get those outputs right away, and save the
final state so you can use it as the initial state for the next 100ms of audio.`}</p>
    <p><span parentName="p" {...{
        "className": "gatsby-resp-image-wrapper",
        "style": {
          "position": "relative",
          "display": "block",
          "marginLeft": "auto",
          "marginRight": "auto",
          "maxWidth": "625px"
        }
      }}>{`
      `}<span parentName="span" {...{
          "className": "gatsby-resp-image-background-image",
          "style": {
            "paddingBottom": "100%",
            "position": "relative",
            "bottom": "0",
            "left": "0",
            "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAAACXBIWXMAAAsTAAALEwEAmpwYAAACMUlEQVQ4y3VUi5KaQBC8//+tJJUYEu8RH2XpaRSBBXktLOyDRdIwHqcpnaKsmd3t6d7eKZ+6ruOcG2O6Ic7nM+XWWuRt22qtUSJB2QzRfcTTuDGCi6JAopTCOo6WZYkSHZFjsa7rG/AIwy/25vO5lBKEWEmSZLFYkBxocV13t9uNhz/BFDhxiiLsERhdTqcT1kkwROV5foeZApgsy8CDBI0gBKehHzk0CyEKzlFewOerQB1F0XQ69bxIiFYI+/6+f3l5juNCyi5N6+VyNZv9IQt7MPzwfT8MQ+qHX/QeDYOED8Ma5ErJG8NI0qgER1gQ0AtBC46i73hnXCFNkoduw6HlcqmHwEqapqvVip4KHJ7n7ff7h27373wl2zaNuHpnPeh+yFxV1Ww2A5JcgX8QMjIfDoftdvuQGWz0sMD3r60UXZKY8VR80HUfDAzGiCaRRtU7Hskw4OM4Jv/uywZms9lAIcmGvaSTZIeMUa87zNAMt3meowXphAUgRyPagnmIzyH5zy3M/WQyWa/XQOJ6cMtxHPiEXXjx9vo6fX5mjBHkhlnKfpJxMYy3VLISIstSlJznWitwpmmCuh9vay/MF9q6LkWlG4sP94tOcZbzxramsco0QRiWVWVsq/vSZLzAH0UPjuPEGu2mtROYX0w7zDi++vaXf92XDsrA/PDkl20+cSts/fTVb9Z8d4U2ZvgnGZjLWru5YoXG5xf6kIhjVgWF7j+uDkl5zCS7lDrMa2nsPydJgpnM53nmAAAAAElFTkSuQmCC')",
            "backgroundSize": "cover",
            "display": "block"
          }
        }}></span>{`
  `}<img parentName="span" {...{
          "className": "gatsby-resp-image-image",
          "alt": "IMAGE",
          "title": "IMAGE",
          "src": "/static/896790eb2e0e662652e5d0ca3d2a9c32/25b0e/blog-stt-speech-recognition-deepspeech-unidirectional.png",
          "srcSet": ["/static/896790eb2e0e662652e5d0ca3d2a9c32/43fa5/blog-stt-speech-recognition-deepspeech-unidirectional.png 250w", "/static/896790eb2e0e662652e5d0ca3d2a9c32/c6e3d/blog-stt-speech-recognition-deepspeech-unidirectional.png 500w", "/static/896790eb2e0e662652e5d0ca3d2a9c32/25b0e/blog-stt-speech-recognition-deepspeech-unidirectional.png 625w"],
          "sizes": "(max-width: 625px) 100vw, 625px",
          "style": {
            "width": "100%",
            "height": "100%",
            "margin": "0",
            "verticalAlign": "middle",
            "position": "absolute",
            "top": "0",
            "left": "0"
          },
          "loading": "lazy"
        }}></img>{`
    `}</span></p>
    <p>{`An alternative architecture that uses a unidirectional RNN in which each time
step only depends on the input at that time and the state from the previous
step.`}</p>
    <p>{`Here’s code for creating an inference graph that can keep track of the state
between each input window:`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`import tensorflow as tf

def create_inference_graph(batch_size=1, n_steps=16, n_features=26, width=64):
    input_ph = tf.placeholder(dtype=tf.float32,
                              shape=[batch_size, n_steps, n_features],
                              name='input')
    sequence_lengths = tf.placeholder(dtype=tf.int32,
                                      shape=[batch_size],
                                      name='input_lengths')
    previous_state_c = tf.get_variable(dtype=tf.float32,
                                       shape=[batch_size, width],
                                       name='previous_state_c')
    previous_state_h = tf.get_variable(dtype=tf.float32,
                                       shape=[batch_size, width],
                                       name='previous_state_h')
    previous_state = tf.contrib.rnn.LSTMStateTuple(previous_state_c, previous_state_h)

    # Transpose from batch major to time major
    input_ = tf.transpose(input_ph, [1, 0, 2])

    # Flatten time and batch dimensions for feed forward layers
    input_ = tf.reshape(input_, [batch_size*n_steps, n_features])

    # Three ReLU hidden layers
    layer1 = tf.contrib.layers.fully_connected(input_, width)
    layer2 = tf.contrib.layers.fully_connected(layer1, width)
    layer3 = tf.contrib.layers.fully_connected(layer2, width)

    # Unidirectional LSTM
    rnn_cell = tf.contrib.rnn.LSTMBlockFusedCell(width)
    rnn, new_state = rnn_cell(layer3, initial_state=previous_state)
    new_state_c, new_state_h = new_state

    # Final hidden layer
    layer5 = tf.contrib.layers.fully_connected(rnn, width)

    # Output layer
    output = tf.contrib.layers.fully_connected(layer5, ALPHABET_SIZE+1, activation_fn=None)

    # Automatically update previous state with new state
    state_update_ops = [
        tf.assign(previous_state_c, new_state_c),
        tf.assign(previous_state_h, new_state_h)
    ]
    with tf.control_dependencies(state_update_ops):
        logits = tf.identity(output, name='logits')

    # Create state initialization operations
    zero_state = tf.zeros([batch_size, width], tf.float32)
    initialize_c = tf.assign(previous_state_c, zero_state)
    initialize_h = tf.assign(previous_state_h, zero_state)
    initialize_state = tf.group(initialize_c, initialize_h, name='initialize_state')

    return {
        'inputs': {
            'input': input_ph,
            'input_lengths': sequence_lengths,
        },
        'outputs': {
            'output': logits,
            'initialize_state': initialize_state,
        }
    }
`}</code></pre>
    <p>{`The graph created by the code above has two inputs and two outputs. The inputs
are the sequences and their lengths. The outputs are the logits and a special
`}<inlineCode parentName="p">{`initialize_state`}</inlineCode>{` node that needs to be run at the beginning of a new
sequence. When freezing the graph, make sure you don’t freeze the state
variables `}<inlineCode parentName="p">{`previous_state_h`}</inlineCode>{` and `}<inlineCode parentName="p">{`previous_state_c`}</inlineCode>{`.`}</p>
    <p>{`Here’s code for freezing the graph:`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`from tensorflow.python.tools import freeze_graph

freeze_graph.freeze_graph_with_def_protos(
        input_graph_def=session.graph_def,
        input_saver_def=saver.as_saver_def(),
        input_checkpoint=checkpoint_path,
        output_node_names='logits,initialize_state',
        restore_op_name=None,
        filename_tensor_name=None,
        output_graph=output_graph_path,
        initializer_nodes='',
        variable_names_blacklist='previous_state_c,previous_state_h')
`}</code></pre>
    <p>{`With these changes to the model, we can use the following approach on the client side:`}</p>
    <ol>
      <li parentName="ol">{`Run the `}<inlineCode parentName="li">{`initialize_state`}</inlineCode>{` node.`}</li>
      <li parentName="ol">{`Accumulate audio samples until there’s enough data to feed to the model (16 time steps in our case, or 320ms).`}</li>
      <li parentName="ol">{`Feed through the model, accumulate outputs somewhere.`}</li>
      <li parentName="ol">{`Repeat steps 2 and 3 until there is no more data.`}</li>
    </ol>
    <p>{`It wouldn’t make sense to drown readers in hundreds of lines of the
client-side code here, but if you’re interested, it’s all MPL 2.0 licensed and
available on `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/STT"
      }}>{`GitHub`}</a>{`. We actually have two
different implementations, `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/STT/blob/bb299dc26554b2fbf864b7f0115b4baece15bda5/evaluate.py#L233"
      }}>{`one in
Python`}</a>{`
that we use for generating test reports, and `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/STT/blob/6f27928841c2595c8dd9d08f482c95ca9e42f4b5/native_client/deepspeech.cc"
      }}>{`one in
C++`}</a>{`
which is behind our official client API.`}</p>
    <h3 {...{
      "id": "performance-improvements",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#performance-improvements",
        "aria-label": "performance improvements permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`Performance improvements`}</h3>
    <p>{`What does this all mean for our STT engine? Well, here are some numbers,
compared with our current stable release:`}</p>
    <ul>
      <li parentName="ul">{`Model size down from 468MB to 180MB`}</li>
      <li parentName="ul">{`Time to transcribe a 3s file on a laptop CPU down from 9s to 1.5s`}</li>
      <li parentName="ul">{`Peak heap usage down from 4GB to 20MB (model is now memory-mapped)`}</li>
      <li parentName="ul">{`Total heap allocations down from 12GB to 264MB`}</li>
    </ul>
    <p>{`Of particular importance to me is that we’re now faster than real time without
using a GPU, which, together with streaming inference, opens up lots of new
usage possibilities like live captioning of radio programs, Twitch streams, and
keynote presentations; home automation; voice-based UIs; and so on. If you’re
looking to integrate speech recognition in your next project, consider using
our engine!`}</p>
    <p>{`Here’s a small Python program that demonstrates how to use SoX to record
from the microphone and feed it into the engine as the audio is being recorded.`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`import argparse
import deepspeech as ds
import numpy as np
import shlex
import subprocess
import sys

parser = argparse.ArgumentParser(description='Coqui STT speech-to-text from microphone')
parser.add_argument('--model', required=True,
                    help='Path to the model (protocol buffer binary file)')
parser.add_argument('--alphabet', required=True,
                    help='Path to the configuration file specifying the alphabet used by the network')
parser.add_argument('--lm', nargs='?',
                    help='Path to the language model binary file')
parser.add_argument('--trie', nargs='?',
                    help='Path to the language model trie file created with native_client/generate_trie')
args = parser.parse_args()

LM_WEIGHT = 1.50
VALID_WORD_COUNT_WEIGHT = 2.25
N_FEATURES = 26
N_CONTEXT = 9
BEAM_WIDTH = 512

print('Initializing model...')

model = ds.Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
if args.lm and args.trie:
    model.enableDecoderWithLM(args.alphabet,
                              args.lm,
                              args.trie,
                              LM_WEIGHT,
                              VALID_WORD_COUNT_WEIGHT)
sctx = model.setupStream()

subproc = subprocess.Popen(shlex.split('rec -q -V0 -e signed -L -c 1 -b 16 -r 16k -t raw - gain -2'),
                           stdout=subprocess.PIPE,
                           bufsize=0)
print('You can start speaking now. Press Control-C to stop recording.')

try:
    while True:
        data = subproc.stdout.read(512)
        model.feedAudioContent(sctx, np.frombuffer(data, np.int16))
except KeyboardInterrupt:
    print('Transcription:', model.finishStream(sctx))
    subproc.terminate()
    subproc.wait()
`}</code></pre>
    <p>{`Finally, if you’re looking to contribute to Project Coqui STT itself, we have
plenty of opportunities. The codebase is written in Python and C++, and we
would love to add iOS and Windows support, for example. Reach out to us via our
`}<a parentName="p" {...{
        "href": "https://gitter.im/coqui-ai/community"
      }}>{`Gitter channel`}</a>{` or our `}<a parentName="p" {...{
        "href": "https://github.com/coqui-ai/STT/discussions"
      }}>{`GitHub
forum`}</a>{`.`}</p>
    <h3 {...{
      "id": "license",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#license",
        "aria-label": "license permalink",
        "className": "anchor before"
      }}><svg parentName="a" {...{
          "xmlns": "http://www.w3.org/2000/svg",
          "width": "16",
          "height": "16",
          "focusable": "false",
          "viewBox": "0 0 16 16"
        }}>{`
  `}<path parentName="svg" {...{
            "fill": "currentColor",
            "d": "M4.441 7.38l.095.083.939.939-.708.707-.939-.939-2 2-.132.142a2.829 2.829 0 003.99 3.99l.142-.132 2-2-.939-.939.707-.708.94.94a1 1 0 01.083 1.32l-.083.094-2 2A3.828 3.828 0 01.972 9.621l.15-.158 2-2A1 1 0 014.34 7.31l.101.07zm7.413-3.234a.5.5 0 01.057.638l-.057.07-7 7a.5.5 0 01-.765-.638l.057-.07 7-7a.5.5 0 01.708 0zm3.023-3.025a3.829 3.829 0 01.15 5.257l-.15.158-2 2a1 1 0 01-1.32.083l-.094-.083-.94-.94.708-.707.939.94 2-2 .132-.142a2.829 2.829 0 00-3.99-3.99l-.142.131-2 2 .939.939-.707.708-.94-.94a1 1 0 01-.082-1.32l.083-.094 2-2a3.828 3.828 0 015.414 0z"
          }}></path>
        </svg></a>{`License`}</h3>
    <p><a parentName="p" {...{
        "href": "https://creativecommons.org/licenses/by-sa/3.0/"
      }}>{`Creative Commons Attribution Share-Alike License v3.0`}</a>{` or any later version`}</p>


    </MDXLayout>;
}
;
MDXContent.isMDXComponent = true;
      