
Remove max_feature_vectors (#149)

Fangjun Kuang 2 years ago
parent
commit
f2e65da473
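
For readers skimming the diff: viewed from the C API (the Kotlin, Swift, and Python bindings below get the same treatment), the change amounts to dropping one field. The sketch below is illustration only, not part of the patch; it restates what the hunks in c-api-examples/decode-file-c-api.c and sherpa-ncnn/c-api/c-api.h show.

    // Before this commit: callers sized the feature cache themselves.
    SherpaNcnnRecognizerConfig config;
    config.feat_config.sampling_rate = 16000;
    config.feat_config.feature_dim = 80;
    config.feat_config.max_feature_vectors = 2 * 100;  // field removed in this commit

    // After this commit: only the sample rate and feature dimension remain;
    // the feature extractor manages its own cache internally
    // (see sherpa-ncnn/csrc/features.cc further down).
    config.feat_config.sampling_rate = 16000;
    config.feat_config.feature_dim = 80;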

+ 1 - 1
.github/scripts/Main.kt

@@ -4,7 +4,7 @@ import android.content.res.AssetManager
 
 fun main() {
     val featConfig =
-        getFeatureExtractorConfig(sampleRate = 16000.0f, featureDim = 80, maxFeatureVectors = -1)
+        getFeatureExtractorConfig(sampleRate = 16000.0f, featureDim = 80)
     val modelConfig = getModelConfig(type = 1, useGPU = false)!!
     val decoderConfig = getDecoderConfig(method = "greedy_search", numActivePaths = 4)
 

+ 1 - 2
android/SherpaNcnn/app/src/main/java/com/k2fsa/sherpa/ncnn/MainActivity.kt

@@ -179,8 +179,7 @@ class MainActivity : AppCompatActivity() {
     private fun initModel() {
         val featConfig = getFeatureExtractorConfig(
             sampleRate = 16000.0f,
-            featureDim = 80,
-            maxFeatureVectors = 1 * 100 // cache 1 second of feature frames
+            featureDim = 80
         )
         //Please change the argument "type" if you use a different model
         val modelConfig = getModelConfig(type = 1, useGPU = false)!!

+ 1 - 4
android/SherpaNcnn/app/src/main/java/com/k2fsa/sherpa/ncnn/SherpaNcnn.kt

@@ -5,7 +5,6 @@ import android.content.res.AssetManager
 data class FeatureExtractorConfig(
     var sampleRate: Float,
     var featureDim: Int,
-    var maxFeatureVectors: Int,
 )
 
 
@@ -87,13 +86,11 @@ class SherpaNcnn(
 
 fun getFeatureExtractorConfig(
     sampleRate: Float,
-    featureDim: Int,
-    maxFeatureVectors: Int
+    featureDim: Int
 ): FeatureExtractorConfig {
     return FeatureExtractorConfig(
         sampleRate = sampleRate,
         featureDim = featureDim,
-        maxFeatureVectors = maxFeatureVectors,
     )
 }
 

+ 0 - 1
c-api-examples/decode-file-c-api.c

@@ -73,7 +73,6 @@ int32_t main(int32_t argc, char *argv[]) {
 
   config.feat_config.sampling_rate = 16000;
   config.feat_config.feature_dim = 80;
-  config.feat_config.max_feature_vectors = 2 * 100;  // 2 seconds cache
 
   SherpaNcnnRecognizer *recognizer = CreateRecognizer(&config);
 

+ 11 - 9
cmake/kaldi-native-fbank.cmake

@@ -2,23 +2,25 @@ function(download_kaldi_native_fbank)
   include(FetchContent)
 
   # Please also change ../pack-for-embedded-systems.sh
-  set(kaldi_native_fbank_URL  "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.13.tar.gz")
-  set(kaldi_native_fbank_URL2 "https://huggingface.co/csukuangfj/sherpa-ncnn-cmake-deps/resolve/main/kaldi-native-fbank-1.13.tar.gz")
-  set(kaldi_native_fbank_HASH "SHA256=1f4d228f9fe3e3e9f92a74a7eecd2489071a03982e4ba6d7c70fc5fa7444df57")
+  set(kaldi_native_fbank_URL  "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.14.tar.gz")
+  set(kaldi_native_fbank_URL2 "https://huggingface.co/csukuangfj/sherpa-ncnn-cmake-deps/resolve/main/kaldi-native-fbank-1.14.tar.gz")
+  set(kaldi_native_fbank_HASH "SHA256=6a66638a111d3ce21fe6f29cbf9ab3dbcae2331c77391bf825927df5cbf2babe")
+
 
   # If you don't have access to the Internet, please download it to your
   # local drive and modify the following line according to your needs.
   set(possible_file_locations
-    $ENV{HOME}/Downloads/kaldi-native-fbank-1.13.tar.gz
-    $ENV{HOME}/asr/kaldi-native-fbank-1.13.tar.gz
-    ${PROJECT_SOURCE_DIR}/kaldi-native-fbank-1.13.tar.gz
-    ${PROJECT_BINARY_DIR}/kaldi-native-fbank-1.13.tar.gz
-    /tmp/kaldi-native-fbank-1.13.tar.gz
+    $ENV{HOME}/Downloads/kaldi-native-fbank-1.14.tar.gz
+    $ENV{HOME}/asr/kaldi-native-fbank-1.14.tar.gz
+    ${PROJECT_SOURCE_DIR}/kaldi-native-fbank-1.14.tar.gz
+    ${PROJECT_BINARY_DIR}/kaldi-native-fbank-1.14.tar.gz
+    /tmp/kaldi-native-fbank-1.14.tar.gz
   )
 
   foreach(f IN LISTS possible_file_locations)
     if(EXISTS ${f})
-      set(kaldi_native_fbank_URL  "file://${f}")
+      set(kaldi_native_fbank_URL  "${f}")
+      file(TO_CMAKE_PATH "${kaldi_native_fbank_URL}" kaldi_native_fbank_URL)
       set(kaldi_native_fbank_URL2)
       break()
     endif()

+ 2 - 1
cmake/ncnn.cmake

@@ -22,7 +22,8 @@ function(download_ncnn)
 
   foreach(f IN LISTS possible_file_locations)
     if(EXISTS ${f})
-      set(ncnn_URL  "file://${f}")
+      set(ncnn_URL  "${f}")
+      file(TO_CMAKE_PATH "${ncnn_URL}" ncnn_URL)
       set(ncnn_URL2)
       break()
     endif()

+ 2 - 1
cmake/portaudio.cmake

@@ -17,7 +17,8 @@ function(download_portaudio)
 
   foreach(f IN LISTS possible_file_locations)
     if(EXISTS ${f})
-      set(portaudio_URL  "file://${f}")
+      set(portaudio_URL  "${f}")
+      file(TO_CMAKE_PATH "${portaudio_URL}" portaudio_URL)
       set(portaudio_URL2)
       break()
     endif()

+ 2 - 1
cmake/pybind11.cmake

@@ -17,7 +17,8 @@ function(download_pybind11)
 
   foreach(f IN LISTS possible_file_locations)
     if(EXISTS ${f})
-      set(pybind11_URL  "file://${f}")
+      set(pybind11_URL  "${f}")
+      file(TO_CMAKE_PATH "${pybind11_URL}" pybind11_URL)
       set(pybind11_URL2)
       break()
     endif()

+ 0 - 1
ffmpeg-examples/sherpa-ncnn-ffmpeg.c

@@ -329,7 +329,6 @@ int main(int argc, char **argv)
 
     config.feat_config.sampling_rate = 16000;
     config.feat_config.feature_dim = 80;
-    config.feat_config.max_feature_vectors = 2 * 100;  // 2 seconds cache
 
     SherpaNcnnRecognizer *recognizer = CreateRecognizer(&config);
 

+ 1 - 2
ios-swift/SherpaNcnn/SherpaNcnn/ViewController.swift

@@ -87,8 +87,7 @@ class ViewController: UIViewController {
         // https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
         let featConfig = sherpaNcnnFeatureExtractorConfig(
             sampleRate: 16000,
-            featureDim: 80,
-            maxFeatureVectors: 1*100)
+            featureDim: 80)
 
         let modelConfig = getMultilingualModelConfig2022_12_06()
         // var modelConfig = getMultilingualModelConfig2022_12_06_Int8()

+ 3 - 3
pack-for-embedded-systems.sh

@@ -25,8 +25,8 @@ rm -v sherpa-ncnn-${SHERPA_NCNN_VERSION}.tar.gz
 
 # Please also change ./build-m3axpi.sh
 wget \
-  -O kaldi-native-fbank-1.13.tar.gz \
-  https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.13.tar.gz
+  -O kaldi-native-fbank-1.14.tar.gz \
+  https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.14.tar.gz
 
 wget \
   -O ncnn-sherpa-0.9.tar.gz \
@@ -47,7 +47,7 @@ It should print something like below:
 
 ls -lh \$HOME/asr
 total 24368
--rw-r--r--   1 fangjun  staff    59K Feb  2 17:01 kaldi-native-fbank-1.13.tar.gz
+-rw-r--r--   1 fangjun  staff    59K Feb  2 17:01 kaldi-native-fbank-1.14.tar.gz
 -rw-r--r--   1 fangjun  staff    12M Feb  2 17:01 sherpa-0.9.tar.gz
 drwxr-xr-x  29 fangjun  staff   928B Feb  2 16:05 sherpa-ncnn-${SHERPA_NCNN_VERSION}
 

+ 0 - 1
python-api-examples/speech-recognition-from-microphone.py

@@ -34,7 +34,6 @@ def create_recognizer():
         joiner_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.param",
         joiner_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.bin",
         num_threads=4,
-        max_feature_vectors=100,  # cache 1 second of feature frames
     )
     return recognizer
 

+ 0 - 2
sherpa-ncnn/c-api/c-api.cc

@@ -86,8 +86,6 @@ SherpaNcnnRecognizer *CreateRecognizer(
 
   config.feat_config.sampling_rate = in_config->feat_config.sampling_rate;
   config.feat_config.feature_dim = in_config->feat_config.feature_dim;
-  config.feat_config.max_feature_vectors =
-      in_config->feat_config.max_feature_vectors;
 
   auto ans = new SherpaNcnnRecognizer;
   ans->recognizer = std::make_unique<sherpa_ncnn::Recognizer>(config);

+ 0 - 4
sherpa-ncnn/c-api/c-api.h

@@ -85,10 +85,6 @@ typedef struct SherpaNcnnFeatureExtractorConfig {
   // feature dimension. Must match the one expected by the model.
   // For instance, it should be 80 for models from icefall.
   int32_t feature_dim;
-
-  // It specifies how many feature frames to cache.
-  // Use -1 to cache all past feature frames.
-  int32_t max_feature_vectors;
 } SherpaNcnnFeatureExtractorConfig;
 
 typedef struct SherpaNcnnRecognizerConfig {

+ 17 - 8
sherpa-ncnn/csrc/features.cc

@@ -33,8 +33,7 @@ std::string FeatureExtractorConfig::ToString() const {
 
   os << "FeatureExtractorConfig(";
   os << "sampling_rate=" << sampling_rate << ", ";
-  os << "feature_dim=" << feature_dim << ", ";
-  os << "max_feature_vectors=" << max_feature_vectors << ")";
+  os << "feature_dim=" << feature_dim << ")";
 
   return os.str();
 }
@@ -46,8 +45,6 @@ class FeatureExtractor::Impl {
     opts_.frame_opts.snip_edges = false;
     opts_.frame_opts.samp_freq = config.sampling_rate;
 
-    opts_.frame_opts.max_feature_vectors = config.max_feature_vectors;
-
     opts_.mel_opts.num_bins = config.feature_dim;
 
     fbank_ = std::make_unique<knf::OnlineFbank>(opts_);
@@ -112,12 +109,21 @@ class FeatureExtractor::Impl {
     return fbank_->IsLastFrame(frame);
   }
 
-  ncnn::Mat GetFrames(int32_t frame_index, int32_t n) const {
-    if (frame_index + n > NumFramesReady()) {
-      NCNN_LOGE("%d + %d > %d", frame_index, n, NumFramesReady());
+  ncnn::Mat GetFrames(int32_t frame_index, int32_t n) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (frame_index + n > fbank_->NumFramesReady()) {
+      NCNN_LOGE("%d + %d > %d", frame_index, n, fbank_->NumFramesReady());
+      exit(-1);
+    }
+
+    int32_t discard_num = frame_index - last_frame_index_;
+    if (discard_num < 0) {
+      NCNN_LOGE("last_frame_index_: %d, frame_index_: %d", last_frame_index_,
+                frame_index);
       exit(-1);
     }
-    std::lock_guard<std::mutex> lock(mutex_);
+
+    fbank_->Pop(discard_num);
 
     int32_t feature_dim = fbank_->Dim();
     ncnn::Mat features;
@@ -128,6 +134,8 @@ class FeatureExtractor::Impl {
       std::copy(f, f + feature_dim, features.row(i));
     }
 
+    last_frame_index_ = frame_index;
+
     return features;
   }
 
@@ -136,6 +144,7 @@ class FeatureExtractor::Impl {
   knf::FbankOptions opts_;
   mutable std::mutex mutex_;
   std::unique_ptr<LinearResample> resampler_;
+  int32_t last_frame_index_ = 0;
 };
 
 FeatureExtractor::FeatureExtractor(const FeatureExtractorConfig &config)
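
A caller-side sketch (not part of the patch) of what the new GetFrames() implies, assuming the public FeatureExtractor simply forwards the Impl methods shown above (AcceptWaveform(), NumFramesReady(), GetFrames()): each call pops the frames before frame_index from the underlying knf::OnlineFbank, so frame_index must never decrease across calls; in exchange, memory stays bounded without a max_feature_vectors knob. The chunk size below is a placeholder.

    // Hypothetical consumption loop over a sherpa_ncnn::FeatureExtractor.
    void Consume(sherpa_ncnn::FeatureExtractor *extractor,
                 const float *samples, int32_t n, int32_t chunk) {
      extractor->AcceptWaveform(16000, samples, n);
      int32_t frame_index = 0;  // absolute index of the next unread frame
      while (frame_index + chunk <= extractor->NumFramesReady()) {
        // GetFrames() first discards frames in [last_frame_index_, frame_index),
        // then copies `chunk` frames starting at frame_index into an ncnn::Mat.
        ncnn::Mat features = extractor->GetFrames(frame_index, chunk);
        // ... run the encoder/decoder on `features` ...
        frame_index += chunk;  // going backwards would hit the discard_num < 0 check
      }
    }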

+ 0 - 4
sherpa-ncnn/csrc/features.h

@@ -32,10 +32,6 @@ struct FeatureExtractorConfig {
   int32_t sampling_rate = 16000;
   int32_t feature_dim = 80;
 
-  // 100 hundred frames per second
-  // It specifies how many past frames to cache
-  int32_t max_feature_vectors = 10 * 100;
-
   std::string ToString() const;
 };
 

+ 0 - 2
sherpa-ncnn/csrc/generate-int8-scale-table.cc

@@ -577,7 +577,6 @@ int QuantNet::quantize_KL(const std::vector<std::string> &wave_filenames) {
     sherpa_ncnn::FeatureExtractorConfig config;
     config.sampling_rate = 16000;
     config.feature_dim = 80;
-    config.max_feature_vectors = -1;
     sherpa_ncnn::FeatureExtractor feature_extractor(config);
     feature_extractor.AcceptWaveform(expected_sampling_rate, samples.data(),
                                      samples.size());
@@ -713,7 +712,6 @@ int QuantNet::quantize_KL(const std::vector<std::string> &wave_filenames) {
     sherpa_ncnn::FeatureExtractorConfig config;
     config.sampling_rate = 16000;
     config.feature_dim = 80;
-    config.max_feature_vectors = -1;
     sherpa_ncnn::FeatureExtractor feature_extractor(config);
     feature_extractor.AcceptWaveform(expected_sampling_rate, samples.data(),
                                      samples.size());

+ 1 - 4
sherpa-ncnn/csrc/sherpa-ncnn-alsa.cc

@@ -120,10 +120,7 @@ as the device_name.
   config.feat_config.sampling_rate = expected_sampling_rate;
   config.feat_config.feature_dim = 80;
 
-  // cache 2 seconds of features
-  config.feat_config.max_feature_vectors = 2 * 100;
-
-  fprintf(stderr, "%s\n", decoder_conf.ToString().c_str());
+  fprintf(stderr, "%s\n", config.ToString().c_str());
 
   sherpa_ncnn::Recognizer recognizer(config);
 

+ 0 - 3
sherpa-ncnn/csrc/sherpa-ncnn-microphone.cc

@@ -107,9 +107,6 @@ for a list of pre-trained models to download.
   config.feat_config.sampling_rate = expected_sampling_rate;
   config.feat_config.feature_dim = 80;
 
-  // cache 2 seconds of features
-  config.feat_config.max_feature_vectors = 2 * 100;
-  //
   fprintf(stderr, "%s\n", config.ToString().c_str());
 
   sherpa_ncnn::Recognizer recognizer(config);

+ 0 - 1
sherpa-ncnn/csrc/sherpa-ncnn.cc

@@ -76,7 +76,6 @@ for a list of pre-trained models to download.
 
   config.feat_config.sampling_rate = expected_sampling_rate;
   config.feat_config.feature_dim = 80;
-  config.feat_config.max_feature_vectors = -1;  // for non-streaming
 
   sherpa_ncnn::Recognizer recognizer(config);
 

+ 0 - 3
sherpa-ncnn/jni/jni.cc

@@ -101,9 +101,6 @@ static FeatureExtractorConfig GetFeatureExtractorConfig(JNIEnv *env,
   fid = env->GetFieldID(feat_config_cls, "featureDim", "I");
   ans.feature_dim = env->GetIntField(feat_config, fid);
 
-  fid = env->GetFieldID(feat_config_cls, "maxFeatureVectors", "I");
-  ans.max_feature_vectors = env->GetIntField(feat_config, fid);
-
   return ans;
 }
 

+ 2 - 6
sherpa-ncnn/python/csrc/features.cc

@@ -27,19 +27,15 @@ namespace sherpa_ncnn {
 void PybindFeatures(py::module *m) {
   using PyClass = FeatureExtractorConfig;
   py::class_<PyClass>(*m, "FeatureExtractorConfig")
-      .def(py::init([](int32_t sampling_rate, int32_t feature_dim,
-                       int32_t max_feature_vectors) {
+      .def(py::init([](int32_t sampling_rate, int32_t feature_dim) {
              auto ans = std::make_unique<PyClass>();
              ans->sampling_rate = sampling_rate;
              ans->feature_dim = feature_dim;
-             ans->max_feature_vectors = max_feature_vectors;
              return ans;
            }),
-           py::arg("sampling_rate"), py::arg("feature_dim"),
-           py::arg("max_feature_vectors"))
+           py::arg("sampling_rate"), py::arg("feature_dim"))
       .def_readwrite("sampling_rate", &PyClass::sampling_rate)
       .def_readwrite("feature_dim", &PyClass::feature_dim)
-      .def_readwrite("max_feature_vectors", &PyClass::max_feature_vectors)
       .def("__str__", &PyClass::ToString);
 }
 

+ 0 - 5
sherpa-ncnn/python/sherpa_ncnn/recognizer.py

@@ -90,7 +90,6 @@ class Recognizer(object):
         rule1_min_trailing_silence: int = 2.4,
         rule2_min_trailing_silence: int = 1.2,
         rule3_min_utterance_length: int = 20,
-        max_feature_vectors: int = -1,
         model_sample_rate: int = 16000,
     ):
         """
@@ -142,9 +141,6 @@ class Recognizer(object):
             Used only when enable_endpoint_detection is True. If the utterance
             length in seconds is larger than this value, we assume an endpoint
             is detected.
-          max_feature_vectors:
-            It specifies the number of feature frames to cache. Use -1
-            to cache all processed frames
           model_sample_rate:
             Sample rate expected by the model
         """
@@ -164,7 +160,6 @@ class Recognizer(object):
         feat_config = FeatureExtractorConfig(
             sampling_rate=model_sample_rate,
             feature_dim=80,
-            max_feature_vectors=-1,
         )
 
         model_config = ModelConfig(

+ 2 - 4
swift-api-examples/SherpaNcnn.swift

@@ -75,13 +75,11 @@ func sherpaNcnnModelConfig(
 
 func sherpaNcnnFeatureExtractorConfig(
     sampleRate: Float,
-    featureDim: Int,
-    maxFeatureVectors: Int
+    featureDim: Int
 )-> SherpaNcnnFeatureExtractorConfig {
     return SherpaNcnnFeatureExtractorConfig(
         sampling_rate: sampleRate,
-        feature_dim: Int32(featureDim),
-        max_feature_vectors: Int32(maxFeatureVectors))
+        feature_dim: Int32(featureDim))
 }
 
 /// Create an instance of SherpaNcnnDecoderConfig

+ 1 - 2
swift-api-examples/decode-file.swift

@@ -28,8 +28,7 @@ func run() {
 
     let featConfig = sherpaNcnnFeatureExtractorConfig(
         sampleRate: 16000,
-        featureDim: 80,
-        maxFeatureVectors: -1
+        featureDim: 80
     )
 
     let modelConfig = sherpaNcnnModelConfig(