123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- #!/usr/bin/env python3
- # Real-time speech recognition from a microphone with sherpa-ncnn Python API
- # with endpoint detection.
- #
- # Note: This script uses ALSA and works only on Linux systems. It is
- # especially useful on embedded Linux systems and on Linux running under
- # Windows via WSL.
- #
- # Please refer to
- # https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
- # to download pre-trained models
- import argparse
- import sys
- import sherpa_ncnn
def get_args():
    """Parse the command-line options of this script.

    The only option is the mandatory ``--device-name``, which selects the
    ALSA capture device to record from.

    Returns:
        The ``argparse.Namespace`` produced by parsing ``sys.argv``; the
        chosen device is available as its ``device_name`` attribute.
    """
    device_name_help = """
The device name specifies which microphone to use in case there are several
on your system. You can use
arecord -l
to find all available microphones on your computer. For instance, if it outputs
**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
Subdevices: 1/1
Subdevice #0: subdevice #0
and if you want to select card 3 and the device 0 on that card, please use:
plughw:3,0
as the device_name.
"""

    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    arg_parser.add_argument(
        "--device-name",
        type=str,
        required=True,
        help=device_name_help,
    )

    namespace = arg_parser.parse_args()
    return namespace
def create_recognizer():
    """Construct the recognizer used by this example.

    The recognizer is configured with the model files found in
    ``./sherpa-ncnn-conv-emformer-transducer-2022-12-06``, four threads,
    ``modified_beam_search`` decoding, endpoint detection enabled and no
    hotwords file.

    Please replace the model files if needed. See
    https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
    for download links.

    Returns:
        The configured ``sherpa_ncnn.Recognizer`` instance.
    """
    model_dir = "./sherpa-ncnn-conv-emformer-transducer-2022-12-06"

    # Every network ships as a ``.param``/``.bin`` pair that follows one
    # naming scheme, e.g. encoder_param ->
    # <model_dir>/encoder_jit_trace-pnnx.ncnn.param
    model_files = {
        f"{network}_{kind}": f"{model_dir}/{network}_jit_trace-pnnx.ncnn.{kind}"
        for network in ("encoder", "decoder", "joiner")
        for kind in ("param", "bin")
    }

    return sherpa_ncnn.Recognizer(
        tokens=f"{model_dir}/tokens.txt",
        **model_files,
        num_threads=4,
        decoding_method="modified_beam_search",
        enable_endpoint_detection=True,
        rule1_min_trailing_silence=2.4,
        rule2_min_trailing_silence=1.2,
        rule3_min_utterance_length=300,
        hotwords_file="",
        hotwords_score=1.5,
    )
def main():
    """Recognize speech from the selected microphone until interrupted.

    Audio is read from the ALSA device given by ``--device-name`` in chunks
    of 0.1 second and fed to the recognizer. Whenever the recognized text
    changes, the current console line is rewritten in place as
    ``<segment_id>:<text>``. When the recognizer reports an endpoint, a
    non-empty result is printed as a finished line, the segment counter is
    advanced, and the recognizer is reset for the next utterance.

    The loop never terminates on its own; the caller is expected to stop it
    (for example with Ctrl + C).
    """
    args = get_args()
    device_name = args.device_name
    print(f"device_name: {device_name}")
    alsa = sherpa_ncnn.Alsa(device_name)

    recognizer = create_recognizer()
    print("Started! Please speak")

    sample_rate = recognizer.sample_rate
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms
    last_result = ""
    segment_id = 0

    while True:
        samples = alsa.read(samples_per_read)  # a blocking read
        recognizer.accept_waveform(sample_rate, samples)

        is_endpoint = recognizer.is_endpoint
        result = recognizer.text

        # Redraw the in-progress line only when the text actually changed.
        if result and last_result != result:
            last_result = result
            print(f"\r{segment_id}:{result}", end="", flush=True)

        if is_endpoint:
            if result:
                # Finish the line; only segments that produced text are
                # numbered.
                print(f"\r{segment_id}:{result}", flush=True)
                segment_id += 1
            recognizer.reset()
            # Forget the previous segment's text as well. Otherwise a new
            # segment whose partial result happens to equal the last
            # segment's final text would not be displayed until its own
            # endpoint.
            last_result = ""
if __name__ == "__main__":
    # Script entry point: run the recognition loop and, when the user
    # presses Ctrl + C, exit with a short notice instead of a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")
|