|
@@ -1,24 +1,19 @@
|
|
|
from pathlib import Path
|
|
|
|
|
|
import numpy as np
|
|
|
-from _sherpa_ncnn import FeatureExtractor, Model, ModelConfig, greedy_search
|
|
|
+from _sherpa_ncnn import (
|
|
|
+ DecoderConfig,
|
|
|
+ EndpointConfig,
|
|
|
+ EndpointRule,
|
|
|
+ ModelConfig,
|
|
|
+)
|
|
|
+from _sherpa_ncnn import Recognizer as _Recognizer
|
|
|
|
|
|
|
|
|
def _assert_file_exists(f: str):
|
|
|
assert Path(f).is_file(), f"{f} does not exist"
|
|
|
|
|
|
|
|
|
-def _read_tokens(tokens):
|
|
|
- sym_table = {}
|
|
|
- with open(tokens, "r", encoding="utf-8") as f:
|
|
|
- for line in f:
|
|
|
- sym, i = line.split()
|
|
|
- sym = sym.replace("▁", " ")
|
|
|
- sym_table[int(i)] = sym
|
|
|
-
|
|
|
- return sym_table
|
|
|
-
|
|
|
-
|
|
|
class Recognizer(object):
|
|
|
"""A class for streaming speech recognition.
|
|
|
|
|
@@ -88,6 +83,12 @@ class Recognizer(object):
|
|
|
joiner_param: str,
|
|
|
joiner_bin: str,
|
|
|
num_threads: int = 4,
|
|
|
+ decoding_method: str = "greedy_search",
|
|
|
+ num_active_paths: int = 4,
|
|
|
+ enable_endpoint_detection: bool = False,
|
|
|
+ rule1_min_trailing_silence: float = 2.4,
|
|
|
+ rule2_min_trailing_silence: float = 1.2,
|
|
|
+ rule3_min_utterance_length: float = 20,
|
|
|
):
|
|
|
"""
|
|
|
Please refer to
|
|
@@ -101,6 +102,7 @@ class Recognizer(object):
|
|
|
columns::
|
|
|
|
|
|
symbol integer_id
|
|
|
+
|
|
|
encoder_param:
|
|
|
Path to ``encoder.ncnn.param``.
|
|
|
encoder_bin:
|
|
@@ -115,6 +117,28 @@ class Recognizer(object):
|
|
|
Path to ``joiner.ncnn.bin``.
|
|
|
num_threads:
|
|
|
Number of threads for neural network computation.
|
|
|
+ decoding_method:
|
|
|
+ Valid decoding methods are: greedy_search, modified_beam_search.
|
|
|
+ num_active_paths:
|
|
|
+ Used only when decoding_method is modified_beam_search. Its value
|
|
|
+ is ignored when decoding_method is greedy_search. It specifies
|
|
|
+ the maximum number of paths to use in beam search.
|
|
|
+ enable_endpoint_detection:
|
|
|
+ True to enable endpoint detection. False to disable endpoint
|
|
|
+ detection.
|
|
|
+ rule1_min_trailing_silence:
|
|
|
+ Used only when enable_endpoint_detection is True. If the duration
|
|
|
+ of trailing silence in seconds is larger than this value, we assume
|
|
|
+ an endpoint is detected.
|
|
|
+ rule2_min_trailing_silence:
|
|
|
+ Used only when enable_endpoint_detection is True. If we have decoded
|
|
|
+ something that is nonsilence and if the duration of trailing silence
|
|
|
+ in seconds is larger than this value, we assume an endpoint is
|
|
|
+ detected.
|
|
|
+ rule3_min_utterance_length:
|
|
|
+ Used only when enable_endpoint_detection is True. If the utterance
|
|
|
+ length in seconds is larger than this value, we assume an endpoint
|
|
|
+ is detected.
|
|
|
"""
|
|
|
_assert_file_exists(tokens)
|
|
|
_assert_file_exists(encoder_param)
|
|
@@ -125,8 +149,10 @@ class Recognizer(object):
|
|
|
_assert_file_exists(joiner_bin)
|
|
|
|
|
|
assert num_threads > 0, num_threads
|
|
|
-
|
|
|
- self.sym_table = _read_tokens(tokens)
|
|
|
+ assert decoding_method in (
|
|
|
+ "greedy_search",
|
|
|
+ "modified_beam_search",
|
|
|
+ ), decoding_method
|
|
|
|
|
|
model_config = ModelConfig(
|
|
|
encoder_param=encoder_param,
|
|
@@ -136,23 +162,30 @@ class Recognizer(object):
|
|
|
joiner_param=joiner_param,
|
|
|
joiner_bin=joiner_bin,
|
|
|
num_threads=num_threads,
|
|
|
+ tokens=tokens,
|
|
|
)
|
|
|
|
|
|
- self.model = Model.create(model_config)
|
|
|
- self.sample_rate = 16000
|
|
|
-
|
|
|
- self.feature_extractor = FeatureExtractor(
|
|
|
- feature_dim=80,
|
|
|
- sample_rate=self.sample_rate,
|
|
|
+ endpoint_config = EndpointConfig(
|
|
|
+ rule1_min_trailing_silence=rule1_min_trailing_silence,
|
|
|
+ rule2_min_trailing_silence=rule2_min_trailing_silence,
|
|
|
+ rule3_min_utterance_length=rule3_min_utterance_length,
|
|
|
)
|
|
|
|
|
|
- self.num_processed = 0 # number of processed feature frames so far
|
|
|
- self.states = [] # model state
|
|
|
+ decoder_config = DecoderConfig(
|
|
|
+ method=decoding_method,
|
|
|
+ num_active_paths=num_active_paths,
|
|
|
+ enable_endpoint=enable_endpoint_detection,
|
|
|
+ endpoint_config=endpoint_config,
|
|
|
+ )
|
|
|
|
|
|
- self.hyp = [0] * self.model.context_size # initial hypothesis
|
|
|
+ # all of our current models are using 16 kHz audio samples
|
|
|
+ self.sample_rate = 16000
|
|
|
|
|
|
- decoder_input = np.array(self.hyp, dtype=np.int32)
|
|
|
- self.decoder_out = self.model.run_decoder(decoder_input)
|
|
|
+ self.recognizer = _Recognizer(
|
|
|
+ decoder_config=decoder_config,
|
|
|
+ model_config=model_config,
|
|
|
+ sample_rate=self.sample_rate,
|
|
|
+ )
|
|
|
|
|
|
def accept_waveform(self, sample_rate: float, waveform: np.array):
|
|
|
"""Decode audio samples.
|
|
@@ -165,37 +198,18 @@ class Recognizer(object):
|
|
|
range ``[-1, 1]``.
|
|
|
"""
|
|
|
assert sample_rate == self.sample_rate, (sample_rate, self.sample_rate)
|
|
|
- self.feature_extractor.accept_waveform(sample_rate, waveform)
|
|
|
-
|
|
|
- self._decode()
|
|
|
+ self.recognizer.accept_waveform(sample_rate, waveform)
|
|
|
+ self.recognizer.decode()
|
|
|
|
|
|
def input_finished(self):
|
|
|
"""Signal that no more audio samples are available."""
|
|
|
- self.feature_extractor.input_finished()
|
|
|
- self._decode()
|
|
|
+ self.recognizer.input_finished()
|
|
|
+ self.recognizer.decode()
|
|
|
|
|
|
@property
|
|
|
def text(self):
|
|
|
- context_size = self.model.context_size
|
|
|
- text = [self.sym_table[token] for token in self.hyp[context_size:]]
|
|
|
- return "".join(text)
|
|
|
-
|
|
|
- def _decode(self):
|
|
|
- segment = self.model.segment
|
|
|
- offset = self.model.offset
|
|
|
-
|
|
|
- while self.feature_extractor.num_frames_ready - self.num_processed >= segment:
|
|
|
- features = self.feature_extractor.get_frames(self.num_processed, segment)
|
|
|
- self.num_processed += offset
|
|
|
+ return self.recognizer.result.text
|
|
|
|
|
|
- encoder_out, self.states = self.model.run_encoder(
|
|
|
- features=features,
|
|
|
- states=self.states,
|
|
|
- )
|
|
|
-
|
|
|
- self.decoder_out, self.hyp = greedy_search(
|
|
|
- model=self.model,
|
|
|
- encoder_out=encoder_out,
|
|
|
- decoder_out=self.decoder_out,
|
|
|
- hyp=self.hyp,
|
|
|
- )
|
|
|
+ @property
|
|
|
+ def is_endpoint(self):
|
|
|
+ return self.recognizer.is_endpoint()
|