real-time-speech-recognition-microphone.js

// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang)
//
const portAudio = require('naudiodon2');
// console.log(portAudio.getDevices());

const sherpa_ncnn = require('sherpa-ncnn');
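
// The model files referenced below come from the pre-trained model
// sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13; it is assumed
// to have been downloaded and unpacked next to this script (see the
// sherpa-ncnn documentation for download links).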
function createRecognizer() {
  // Paths to the streaming zipformer transducer model (encoder/decoder/joiner
  // in ncnn format) and its token table.
  let modelConfig = {
    encoderParam:
        './sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.param',
    encoderBin:
        './sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin',
    decoderParam:
        './sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.param',
    decoderBin:
        './sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin',
    joinerParam:
        './sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.param',
    joinerBin:
        './sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin',
    tokens:
        './sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/tokens.txt',
    useVulkanCompute: 0,
    numThreads: 1,
  };

  let decoderConfig = {
    decodingMethod: 'greedy_search',
    numActivePaths: 4,  // only used by modified_beam_search
  };

  let featConfig = {
    samplingRate: 16000,  // 16 kHz input
    featureDim: 80,       // 80-dim fbank features
  };

  let config = {
    featConfig: featConfig,
    modelConfig: modelConfig,
    decoderConfig: decoderConfig,
    enableEndpoint: 1,
    // Endpoint detection: rule1/rule2 are minimum trailing silences in
    // seconds; rule3 caps the utterance length in seconds.
    rule1MinTrailingSilence: 1.2,
    rule2MinTrailingSilence: 2.4,
    rule3MinUtteranceLength: 20,
  };

  return sherpa_ncnn.createRecognizer(config);
}
const recognizer = createRecognizer();
const stream = recognizer.createStream();

let lastText = '';
let segmentIndex = 0;
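
// Optional sketch (not part of the original example): instead of the default
// device (deviceId: -1 below), a specific microphone can be selected from
// portAudio.getDevices(). The 'USB' name filter is purely illustrative;
// match whatever console.log(portAudio.getDevices()) prints on your machine:
//
//   const mic = portAudio.getDevices()
//                   .find(d => d.maxInputChannels > 0 && d.name.includes('USB'));
//   const micDeviceId = mic !== undefined ? mic.id : -1;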
const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected;
                         // if set to false, just log the error
    deviceId: -1,  // Use -1 or omit deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: recognizer.config.featConfig.samplingRate
  }
});
ai.on('data', data => {
  // Interpret the raw buffer as 32-bit float samples and feed them in.
  const samples = new Float32Array(data.buffer);
  stream.acceptWaveform(recognizer.config.featConfig.samplingRate, samples);

  // Decode as long as enough feature frames are available.
  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }

  const isEndpoint = recognizer.isEndpoint(stream);
  const text = recognizer.getResult(stream);

  // Print the (partial) result whenever it changes.
  if (text.length > 0 && lastText !== text) {
    lastText = text;
    console.log(segmentIndex, lastText);
  }

  // On an endpoint, start a new segment and reset the stream.
  if (isEndpoint) {
    if (text.length > 0) {
      lastText = text;
      segmentIndex += 1;
    }
    recognizer.reset(stream);
  }
});
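
// The 'close' event fires when the capture stream ends, e.g. on an audio
// error with closeOnError: true above.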
ai.on('close', () => {
  console.log('Free resources');
  stream.free();
  recognizer.free();
});

ai.start();
console.log('Started! Please speak');
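
// To run (assuming the model directory above is in place and the
// dependencies are installed, e.g. `npm install naudiodon2 sherpa-ncnn`):
//
//   node ./real-time-speech-recognition-microphone.js
//
// Press Ctrl+C to stop.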