speech-recognition-from-microphone-with-endpoint-detection-alsa.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. #!/usr/bin/env python3
  2. # Real-time speech recognition from a microphone with sherpa-ncnn Python API
  3. # with endpoint detection.
  4. #
# Note: This script uses ALSA and works only on Linux systems, especially
# on embedded Linux systems and when running Linux on Windows using WSL.
  7. #
  8. # Please refer to
  9. # https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
  10. # to download pre-trained models
  11. import argparse
  12. import sys
  13. import sherpa_ncnn
  14. def get_args():
  15. parser = argparse.ArgumentParser(
  16. formatter_class=argparse.ArgumentDefaultsHelpFormatter
  17. )
  18. parser.add_argument(
  19. "--device-name",
  20. type=str,
  21. required=True,
  22. help="""
  23. The device name specifies which microphone to use in case there are several
  24. on your system. You can use
  25. arecord -l
  26. to find all available microphones on your computer. For instance, if it outputs
  27. **** List of CAPTURE Hardware Devices ****
  28. card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  29. Subdevices: 1/1
  30. Subdevice #0: subdevice #0
  31. and if you want to select card 3 and the device 0 on that card, please use:
  32. plughw:3,0
  33. as the device_name.
  34. """,
  35. )
  36. return parser.parse_args()
  37. def create_recognizer():
  38. # Please replace the model files if needed.
  39. # See https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
  40. # for download links.
  41. recognizer = sherpa_ncnn.Recognizer(
  42. tokens="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/tokens.txt",
  43. encoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.param",
  44. encoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.bin",
  45. decoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.param",
  46. decoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.bin",
  47. joiner_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.param",
  48. joiner_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.bin",
  49. num_threads=4,
  50. decoding_method="modified_beam_search",
  51. enable_endpoint_detection=True,
  52. rule1_min_trailing_silence=2.4,
  53. rule2_min_trailing_silence=1.2,
  54. rule3_min_utterance_length=300,
  55. hotwords_file="",
  56. hotwords_score=1.5,
  57. )
  58. return recognizer
  59. def main():
  60. args = get_args()
  61. device_name = args.device_name
  62. print(f"device_name: {device_name}")
  63. alsa = sherpa_ncnn.Alsa(device_name)
  64. recognizer = create_recognizer()
  65. print("Started! Please speak")
  66. sample_rate = recognizer.sample_rate
  67. samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
  68. last_result = ""
  69. segment_id = 0
  70. while True:
  71. samples = alsa.read(samples_per_read) # a blocking read
  72. recognizer.accept_waveform(sample_rate, samples)
  73. is_endpoint = recognizer.is_endpoint
  74. result = recognizer.text
  75. if result and (last_result != result):
  76. last_result = result
  77. print("\r{}:{}".format(segment_id, result), end="", flush=True)
  78. if is_endpoint:
  79. if result:
  80. print("\r{}:{}".format(segment_id, result), flush=True)
  81. segment_id += 1
  82. recognizer.reset()
if __name__ == "__main__":
    # Run until the user stops recording with Ctrl + C; exit cleanly
    # instead of dumping a KeyboardInterrupt traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")