há 2 anos atrás · c98d2999e2
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,6 +38,9 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 
				 list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
			
 
				 list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
			
 
				 
			
 
				+set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ version to be used.")
			
 
				+set(CMAKE_CXX_EXTENSIONS OFF)
			
 
				+
			
 
				 include(kaldi-native-fbank)
			
 
				 include(ncnn)
			
 
				 include(portaudio)
			
--- a/cmake/ncnn.cmake
+++ b/cmake/ncnn.cmake
@@ -1,8 +1,16 @@
 
				 function(download_ncnn)
			
 
				   include(FetchContent)
			
 
				 
			
 
				-  set(ncnn_URL  "http://github.com/csukuangfj/ncnn/archive/refs/tags/sherpa-0.6.tar.gz")
			
 
				-  set(ncnn_HASH "SHA256=aac5298f00ae9ce447c2aefa6c46579dcb3a284b9ce17687c182ecf4d499b3c8")
			
 
				+  # We use a modified version of NCNN.
			
 
				+  # The changed code is in
			
 
				+  # https://github.com/csukuangfj/ncnn/pull/7
			
 
				+
			
 
				+  # If you don't have access to the internet, please download it to your
			
 
				+  # local drive and modify the following line according to your needs.
			
 
				+  # set(ncnn_URL  "file:///ceph-fj/fangjun/372e5f3d0e8b4024e377388b0f336bc4397a2f06.zip")
			
 
				+
			
 
				+  set(ncnn_URL  "https://github.com/csukuangfj/ncnn/archive/372e5f3d0e8b4024e377388b0f336bc4397a2f06.zip")
			
 
				+  set(ncnn_HASH "SHA256=1b1bcd510085c5173a1fb1f7d1459690b8919dd2fa527b1140e39d2a820e0ae0")
			
 
				 
			
 
				   FetchContent_Declare(ncnn
			
 
				     URL               ${ncnn_URL}
			
@@ -16,9 +24,6 @@ function(download_ncnn)
 
				   set(NCNN_PIXEL_DRAWING OFF CACHE BOOL "" FORCE)
			
 
				   set(NCNN_BUILD_BENCHMARK OFF CACHE BOOL "" FORCE)
			
 
				 
			
 
				-  set(NCNN_INT8 OFF CACHE BOOL "" FORCE) # TODO(fangjun): enable it
			
 
				-  set(NCNN_BF16 OFF CACHE BOOL "" FORCE) # TODO(fangjun): enable it
			
 
				-
			
 
				   set(NCNN_BUILD_TOOLS OFF CACHE BOOL "" FORCE)
			
 
				   set(NCNN_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
			
 
				   set(NCNN_BUILD_TESTS OFF CACHE BOOL "" FORCE)
			
@@ -61,8 +66,8 @@ function(download_ncnn)
 
				     ROIPooling
			
 
				     Scale
			
 
				     # Sigmoid
			
 
				-    Slice
			
 
				-    Softmax
			
 
				+    # Slice
			
 
				+    # Softmax
			
 
				     # Split
			
 
				     SPP
			
 
				     # TanH
			
@@ -111,15 +116,15 @@ function(download_ncnn)
 
				     GRU
			
 
				     MultiHeadAttention
			
 
				     GELU
			
 
				-    Convolution1D
			
 
				+    # Convolution1D
			
 
				     Pooling1D
			
 
				     # ConvolutionDepthWise1D
			
 
				     Convolution3D
			
 
				     ConvolutionDepthWise3D
			
 
				     Pooling3D
			
 
				-    MatMul
			
 
				+    # MatMul
			
 
				     Deconvolution1D
			
 
				-    DeconvolutionDepthWise1D
			
 
				+    # DeconvolutionDepthWise1D
			
 
				     Deconvolution3D
			
 
				     DeconvolutionDepthWise3D
			
 
				     Einsum
			
@@ -127,8 +132,12 @@ function(download_ncnn)
 
				     RelPositionalEncoding
			
 
				     MakePadMask
			
 
				     RelShift
			
 
				-    GLU
			
 
				+    # GLU
			
 
				+    Fold
			
 
				+    Unfold
			
 
				+    GridSample
			
 
				   )
			
 
				+
			
 
				   foreach(layer IN LISTS disabled_layers)
			
 
				     string(TOLOWER ${layer} name)
			
 
				     set(WITH_LAYER_${name} OFF CACHE BOOL "" FORCE)
			
--- a/sherpa-ncnn/csrc/CMakeLists.txt
+++ b/sherpa-ncnn/csrc/CMakeLists.txt
@@ -1,6 +1,7 @@
 
				 include_directories(${CMAKE_SOURCE_DIR})
			
 
				 
			
 
				 set(sherpa_ncnn_core_srcs
			
 
				+  conv-emformer-model.cc
			
 
				   decode.cc
			
 
				   features.cc
			
 
				   lstm-model.cc
			
--- a/sherpa-ncnn/csrc/conv-emformer-model.cc
+++ b/sherpa-ncnn/csrc/conv-emformer-model.cc
@@ -0,0 +1,202 @@
 
				+// sherpa-ncnn/csrc/conv-emformer-model.cc
			
 
				+//
			
 
				+// Copyright (c)  2022  Xiaomi Corporation
			
 
				+
			
 
				+#include "sherpa-ncnn/csrc/conv-emformer-model.h"
			
 
				+
			
 
				+#include <regex>
			
 
				+
			
 
				+#include "net.h"  // NOLINT
			
 
				+
			
 
				+namespace sherpa_ncnn {
			
 
				+
			
 
				+ConvEmformerModel::ConvEmformerModel(const ModelConfig &config)
			
 
				+    : num_threads_(config.num_threads) {
			
 
				+  InitEncoder(config.encoder_param, config.encoder_bin);
			
 
				+  InitDecoder(config.decoder_param, config.decoder_bin);
			
 
				+  InitJoiner(config.joiner_param, config.joiner_bin);
			
 
				+
			
 
				+  InitEncoderInputOutputIndexes();
			
 
				+  InitDecoderInputOutputIndexes();
			
 
				+  InitJoinerInputOutputIndexes();
			
 
				+}
			
 
				+
			
 
				+std::pair<ncnn::Mat, std::vector<ncnn::Mat>> ConvEmformerModel::RunEncoder(
			
 
				+    ncnn::Mat &features, const std::vector<ncnn::Mat> &states) {
			
 
				+  std::vector<ncnn::Mat> _states;
			
 
				+
			
 
				+  const ncnn::Mat *p;
			
 
				+  if (states.empty()) {
			
 
				+    _states = GetEncoderInitStates();
			
 
				+    p = _states.data();
			
 
				+  } else {
			
 
				+    p = states.data();
			
 
				+  }
			
 
				+
			
 
				+  ncnn::Extractor encoder_ex = encoder_.create_extractor();
			
 
				+  encoder_ex.set_num_threads(num_threads_);
			
 
				+
			
 
				+  // Note: We ignore error check there
			
 
				+  encoder_ex.input(encoder_input_indexes_[0], features);
			
 
				+  for (int32_t i = 1; i != encoder_input_indexes_.size(); ++i) {
			
 
				+    encoder_ex.input(encoder_input_indexes_[i], p[i - 1]);
			
 
				+  }
			
 
				+
			
 
				+  ncnn::Mat encoder_out;
			
 
				+  encoder_ex.extract(encoder_output_indexes_[0], encoder_out);
			
 
				+
			
 
				+  std::vector<ncnn::Mat> next_states(num_layers_ * 4);
			
 
				+  for (int32_t i = 1; i != encoder_output_indexes_.size(); ++i) {
			
 
				+    encoder_ex.extract(encoder_output_indexes_[i], next_states[i - 1]);
			
 
				+  }
			
 
				+
			
 
				+  return {encoder_out, next_states};
			
 
				+}
			
 
				+
			
 
				+ncnn::Mat ConvEmformerModel::RunDecoder(ncnn::Mat &decoder_input) {
			
 
				+  ncnn::Extractor decoder_ex = decoder_.create_extractor();
			
 
				+  decoder_ex.set_num_threads(num_threads_);
			
 
				+
			
 
				+  ncnn::Mat decoder_out;
			
 
				+  decoder_ex.input(decoder_input_indexes_[0], decoder_input);
			
 
				+  decoder_ex.extract(decoder_output_indexes_[0], decoder_out);
			
 
				+  decoder_out = decoder_out.reshape(decoder_out.w);
			
 
				+
			
 
				+  return decoder_out;
			
 
				+}
			
 
				+
			
 
				+ncnn::Mat ConvEmformerModel::RunJoiner(ncnn::Mat &encoder_out,
			
 
				+                                       ncnn::Mat &decoder_out) {
			
 
				+  auto joiner_ex = joiner_.create_extractor();
			
 
				+  joiner_ex.set_num_threads(num_threads_);
			
 
				+  joiner_ex.input(joiner_input_indexes_[0], encoder_out);
			
 
				+  joiner_ex.input(joiner_input_indexes_[1], decoder_out);
			
 
				+
			
 
				+  ncnn::Mat joiner_out;
			
 
				+  joiner_ex.extract("out0", joiner_out);
			
 
				+  return joiner_out;
			
 
				+}
			
 
				+
			
 
				+void ConvEmformerModel::InitEncoder(const std::string &encoder_param,
			
 
				+                                    const std::string &encoder_bin) {
			
 
				+  InitNet(encoder_, encoder_param, encoder_bin);
			
 
				+}
			
 
				+
			
 
				+void ConvEmformerModel::InitDecoder(const std::string &decoder_param,
			
 
				+                                    const std::string &decoder_bin) {
			
 
				+  InitNet(decoder_, decoder_param, decoder_bin);
			
 
				+}
			
 
				+
			
 
				+void ConvEmformerModel::InitJoiner(const std::string &joiner_param,
			
 
				+                                   const std::string &joiner_bin) {
			
 
				+  InitNet(joiner_, joiner_param, joiner_bin);
			
 
				+}
			
 
				+
			
 
				+std::vector<ncnn::Mat> ConvEmformerModel::GetEncoderInitStates() const {
			
 
				+  std::vector<ncnn::Mat> states;
			
 
				+  states.reserve(num_layers_ * 4);
			
 
				+
			
 
				+  for (int32_t i = 0; i != num_layers_; ++i) {
			
 
				+    auto s0 = ncnn::Mat(d_model_, memory_size_);
			
 
				+    auto s1 = ncnn::Mat(d_model_, left_context_length_);
			
 
				+    auto s2 = ncnn::Mat(d_model_, left_context_length_);
			
 
				+    auto s3 = ncnn::Mat(cnn_module_kernel_ - 1, d_model_);
			
 
				+
			
 
				+    s0.fill(0);
			
 
				+    s1.fill(0);
			
 
				+    s2.fill(0);
			
 
				+    s3.fill(0);
			
 
				+
			
 
				+    states.push_back(s0);
			
 
				+    states.push_back(s1);
			
 
				+    states.push_back(s2);
			
 
				+    states.push_back(s3);
			
 
				+  }
			
 
				+
			
 
				+  return states;
			
 
				+}
			
 
				+
			
 
				+void ConvEmformerModel::InitEncoderInputOutputIndexes() {
			
 
				+  // input indexes map
			
 
				+  // [0] -> in0, features,
			
 
				+  // [1] -> in1, layer0, s0
			
 
				+  // [2] -> in2, layer0, s1
			
 
				+  // [3] -> in3, layer0, s2
			
 
				+  // [4] -> in4, layer0, s3
			
 
				+  //
			
 
				+  // [5] -> in5, layer1, s0
			
 
				+  // [6] -> in6, layer1, s1
			
 
				+  // [7] -> in7, layer1, s2
			
 
				+  // [8] -> in8, layer1, s3
			
 
				+  //
			
 
				+  // until layer 11
			
 
				+  encoder_input_indexes_.resize(1 + num_layers_ * 4);
			
 
				+
			
 
				+  // output indexes map
			
 
				+  // [0] -> out0, encoder_out
			
 
				+  //
			
 
				+  // [1] -> out1, layer0, s0
			
 
				+  // [2] -> out2, layer0, s1
			
 
				+  // [3] -> out3, layer0, s2
			
 
				+  // [4] -> out4, layer0, s3
			
 
				+  //
			
 
				+  // [5] -> out5, layer1, s0
			
 
				+  // [6] -> out6, layer1, s1
			
 
				+  // [7] -> out7, layer1, s2
			
 
				+  // [8] -> out8, layer1, s3
			
 
				+  encoder_output_indexes_.resize(1 + num_layers_ * 4);
			
 
				+  const auto &blobs = encoder_.blobs();
			
 
				+
			
 
				+  std::regex in_regex("in(\\d+)");
			
 
				+  std::regex out_regex("out(\\d+)");
			
 
				+
			
 
				+  std::smatch match;
			
 
				+  for (int32_t i = 0; i != blobs.size(); ++i) {
			
 
				+    const auto &b = blobs[i];
			
 
				+    if (std::regex_match(b.name, match, in_regex)) {
			
 
				+      auto index = std::atoi(match[1].str().c_str());
			
 
				+      encoder_input_indexes_[index] = i;
			
 
				+    } else if (std::regex_match(b.name, match, out_regex)) {
			
 
				+      auto index = std::atoi(match[1].str().c_str());
			
 
				+      encoder_output_indexes_[index] = i;
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void ConvEmformerModel::InitDecoderInputOutputIndexes() {
			
 
				+  // input indexes map
			
 
				+  // [0] -> in0, decoder_input,
			
 
				+  decoder_input_indexes_.resize(1);
			
 
				+
			
 
				+  // output indexes map
			
 
				+  // [0] -> out0, decoder_out,
			
 
				+  decoder_output_indexes_.resize(1);
			
 
				+
			
 
				+  const auto &blobs = decoder_.blobs();
			
 
				+  for (int32_t i = 0; i != blobs.size(); ++i) {
			
 
				+    const auto &b = blobs[i];
			
 
				+    if (b.name == "in0") decoder_input_indexes_[0] = i;
			
 
				+    if (b.name == "out0") decoder_output_indexes_[0] = i;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void ConvEmformerModel::InitJoinerInputOutputIndexes() {
			
 
				+  // input indexes map
			
 
				+  // [0] -> in0, encoder_input,
			
 
				+  // [1] -> in1, decoder_input,
			
 
				+  joiner_input_indexes_.resize(2);
			
 
				+
			
 
				+  // output indexes map
			
 
				+  // [0] -> out0, joiner_out,
			
 
				+  joiner_output_indexes_.resize(1);
			
 
				+
			
 
				+  const auto &blobs = joiner_.blobs();
			
 
				+  for (int32_t i = 0; i != blobs.size(); ++i) {
			
 
				+    const auto &b = blobs[i];
			
 
				+    if (b.name == "in0") joiner_input_indexes_[0] = i;
			
 
				+    if (b.name == "in1") joiner_input_indexes_[1] = i;
			
 
				+    if (b.name == "out0") joiner_output_indexes_[0] = i;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+}  // namespace sherpa_ncnn
			
--- a/sherpa-ncnn/csrc/conv-emformer-model.h
+++ b/sherpa-ncnn/csrc/conv-emformer-model.h
@@ -0,0 +1,74 @@
 
				+// sherpa-ncnn/csrc/conv-emformer-model.h
			
 
				+//
			
 
				+// Copyright (c)  2022  Xiaomi Corporation
			
 
				+
			
 
				+#include "net.h"  // NOLINT
			
 
				+#include "sherpa-ncnn/csrc/model.h"
			
 
				+
			
 
				+namespace sherpa_ncnn {
			
 
				+// Please refer to https://github.com/k2-fsa/icefall/pull/717
			
 
				+// for how the model is converted from icefall to ncnn
			
 
				+class ConvEmformerModel : public Model {
			
 
				+ public:
			
 
				+  explicit ConvEmformerModel(const ModelConfig &config);
			
 
				+
			
 
				+  std::pair<ncnn::Mat, std::vector<ncnn::Mat>> RunEncoder(
			
 
				+      ncnn::Mat &features, const std::vector<ncnn::Mat> &states) override;
			
 
				+
			
 
				+  ncnn::Mat RunDecoder(ncnn::Mat &decoder_input) override;
			
 
				+
			
 
				+  ncnn::Mat RunJoiner(ncnn::Mat &encoder_out, ncnn::Mat &decoder_out) override;
			
 
				+
			
 
				+  int32_t Segment() const override {
			
 
				+    // chunk_length 32
			
 
				+    // right_context 8
			
 
				+    // subsampling factor 4
			
 
				+    //
			
 
				+    // segment = 32 + (8 + 2 * 4 + 3) = 32 + 19 = 51
			
 
				+    return 51;
			
 
				+  }
			
 
				+
			
 
				+  // Advance the feature extract by this number of frames after
			
 
				+  // running the encoder network
			
 
				+  int32_t Offset() const override { return chunk_length_; }
			
 
				+
			
 
				+ private:
			
 
				+  void InitEncoder(const std::string &encoder_param,
			
 
				+                   const std::string &encoder_bin);
			
 
				+  void InitDecoder(const std::string &decoder_param,
			
 
				+                   const std::string &decoder_bin);
			
 
				+  void InitJoiner(const std::string &joiner_param,
			
 
				+                  const std::string &joiner_bin);
			
 
				+
			
 
				+  std::vector<ncnn::Mat> GetEncoderInitStates() const;
			
 
				+
			
 
				+  void InitEncoderInputOutputIndexes();
			
 
				+  void InitDecoderInputOutputIndexes();
			
 
				+  void InitJoinerInputOutputIndexes();
			
 
				+
			
 
				+ private:
			
 
				+  ncnn::Net encoder_;
			
 
				+  ncnn::Net decoder_;
			
 
				+  ncnn::Net joiner_;
			
 
				+
			
 
				+  int32_t num_threads_;
			
 
				+
			
 
				+  int32_t num_layers_ = 12;
			
 
				+  int32_t memory_size_ = 32;
			
 
				+  int32_t cnn_module_kernel_ = 31;
			
 
				+  int32_t left_context_length_ = 32 / 4;
			
 
				+  int32_t chunk_length_ = 32;
			
 
				+  int32_t right_context_length_ = 8;
			
 
				+  int32_t d_model_ = 512;
			
 
				+
			
 
				+  std::vector<int32_t> encoder_input_indexes_;
			
 
				+  std::vector<int32_t> encoder_output_indexes_;
			
 
				+
			
 
				+  std::vector<int32_t> decoder_input_indexes_;
			
 
				+  std::vector<int32_t> decoder_output_indexes_;
			
 
				+
			
 
				+  std::vector<int32_t> joiner_input_indexes_;
			
 
				+  std::vector<int32_t> joiner_output_indexes_;
			
 
				+};
			
 
				+
			
 
				+}  // namespace sherpa_ncnn
			
--- a/sherpa-ncnn/csrc/lstm-model.cc
+++ b/sherpa-ncnn/csrc/lstm-model.cc
@@ -17,67 +17,52 @@
 
				  */
			
 
				 #include "sherpa-ncnn/csrc/lstm-model.h"
			
 
				 
			
 
				-#include <iostream>
			
 
				 #include <utility>
			
 
				 #include <vector>
			
 
				 
			
 
				 namespace sherpa_ncnn {
			
 
				 
			
 
				-static void InitNet(ncnn::Net &net, const std::string &param,
			
 
				-                    const std::string &bin) {
			
 
				-  if (net.load_param(param.c_str())) {
			
 
				-    std::cerr << "failed to load " << param << "\n";
			
 
				-    exit(-1);
			
 
				-  }
			
 
				-
			
 
				-  if (net.load_model(bin.c_str())) {
			
 
				-    std::cerr << "failed to load " << bin << "\n";
			
 
				-    exit(-1);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				 LstmModel::LstmModel(const ModelConfig &config)
			
 
				     : num_threads_(config.num_threads) {
			
 
				   InitEncoder(config.encoder_param, config.encoder_bin);
			
 
				   InitDecoder(config.decoder_param, config.decoder_bin);
			
 
				   InitJoiner(config.joiner_param, config.joiner_bin);
			
 
				+
			
 
				+  InitEncoderInputOutputIndexes();
			
 
				+  InitDecoderInputOutputIndexes();
			
 
				+  InitJoinerInputOutputIndexes();
			
 
				 }
			
 
				 
			
 
				 std::pair<ncnn::Mat, std::vector<ncnn::Mat>> LstmModel::RunEncoder(
			
 
				     ncnn::Mat &features, const std::vector<ncnn::Mat> &states) {
			
 
				-  int32_t num_encoder_layers = 12;
			
 
				-  int32_t d_model = 512;
			
 
				-  int32_t rnn_hidden_size = 1024;
			
 
				   ncnn::Mat hx;
			
 
				   ncnn::Mat cx;
			
 
				 
			
 
				   if (states.empty()) {
			
 
				-    hx.create(d_model, num_encoder_layers);
			
 
				-    cx.create(rnn_hidden_size, num_encoder_layers);
			
 
				-
			
 
				-    hx.fill(0);
			
 
				-    cx.fill(0);
			
 
				+    auto s = GetEncoderInitStates();
			
 
				+    hx = s[0];
			
 
				+    cx = s[1];
			
 
				   } else {
			
 
				     hx = states[0];
			
 
				     cx = states[1];
			
 
				   }
			
 
				 
			
 
				-  ncnn::Mat feature_lengths(1);
			
 
				-  feature_lengths[0] = features.h;
			
 
				+  ncnn::Mat feature_length(1);
			
 
				+  feature_length[0] = features.h;
			
 
				 
			
 
				   ncnn::Extractor encoder_ex = encoder_.create_extractor();
			
 
				   encoder_ex.set_num_threads(num_threads_);
			
 
				 
			
 
				-  encoder_ex.input("in0", features);
			
 
				-  encoder_ex.input("in1", feature_lengths);
			
 
				-  encoder_ex.input("in2", hx);
			
 
				-  encoder_ex.input("in3", cx);
			
 
				+  encoder_ex.input(encoder_input_indexes_[0], features);
			
 
				+  encoder_ex.input(encoder_input_indexes_[1], feature_length);
			
 
				+  encoder_ex.input(encoder_input_indexes_[2], hx);
			
 
				+  encoder_ex.input(encoder_input_indexes_[3], cx);
			
 
				 
			
 
				   ncnn::Mat encoder_out;
			
 
				-  encoder_ex.extract("out0", encoder_out);
			
 
				+  encoder_ex.extract(encoder_output_indexes_[0], encoder_out);
			
 
				 
			
 
				-  encoder_ex.extract("out2", hx);
			
 
				-  encoder_ex.extract("out3", cx);
			
 
				+  encoder_ex.extract(encoder_output_indexes_[1], hx);
			
 
				+  encoder_ex.extract(encoder_output_indexes_[2], cx);
			
 
				 
			
 
				   std::vector<ncnn::Mat> next_states = {hx, cx};
			
 
				 
			
@@ -89,8 +74,8 @@ ncnn::Mat LstmModel::RunDecoder(ncnn::Mat &decoder_input) {
 
				   decoder_ex.set_num_threads(num_threads_);
			
 
				 
			
 
				   ncnn::Mat decoder_out;
			
 
				-  decoder_ex.input("in0", decoder_input);
			
 
				-  decoder_ex.extract("out0", decoder_out);
			
 
				+  decoder_ex.input(decoder_input_indexes_[0], decoder_input);
			
 
				+  decoder_ex.extract(decoder_output_indexes_[0], decoder_out);
			
 
				   decoder_out = decoder_out.reshape(decoder_out.w);
			
 
				 
			
 
				   return decoder_out;
			
@@ -99,11 +84,11 @@ ncnn::Mat LstmModel::RunDecoder(ncnn::Mat &decoder_input) {
 
				 ncnn::Mat LstmModel::RunJoiner(ncnn::Mat &encoder_out, ncnn::Mat &decoder_out) {
			
 
				   auto joiner_ex = joiner_.create_extractor();
			
 
				   joiner_ex.set_num_threads(num_threads_);
			
 
				-  joiner_ex.input("in0", encoder_out);
			
 
				-  joiner_ex.input("in1", decoder_out);
			
 
				+  joiner_ex.input(joiner_input_indexes_[0], encoder_out);
			
 
				+  joiner_ex.input(joiner_input_indexes_[1], decoder_out);
			
 
				 
			
 
				   ncnn::Mat joiner_out;
			
 
				-  joiner_ex.extract("out0", joiner_out);
			
 
				+  joiner_ex.extract(joiner_output_indexes_[0], joiner_out);
			
 
				   return joiner_out;
			
 
				 }
			
 
				 
			
@@ -124,4 +109,80 @@ void LstmModel::InitJoiner(const std::string &joiner_param,
 
				   InitNet(joiner_, joiner_param, joiner_bin);
			
 
				 }
			
 
				 
			
 
				+std::vector<ncnn::Mat> LstmModel::GetEncoderInitStates() const {
			
 
				+  int32_t num_encoder_layers = 12;
			
 
				+  int32_t d_model = 512;
			
 
				+  int32_t rnn_hidden_size = 1024;
			
 
				+
			
 
				+  auto hx = ncnn::Mat(d_model, num_encoder_layers);
			
 
				+  auto cx = ncnn::Mat(rnn_hidden_size, num_encoder_layers);
			
 
				+
			
 
				+  hx.fill(0);
			
 
				+  cx.fill(0);
			
 
				+
			
 
				+  return {hx, cx};
			
 
				+}
			
 
				+
			
 
				+void LstmModel::InitEncoderInputOutputIndexes() {
			
 
				+  // input indexes map
			
 
				+  // [0] -> in0, features,
			
 
				+  // [1] -> in1, features_length
			
 
				+  // [2] -> in2, hx
			
 
				+  // [3] -> in3, cx
			
 
				+  encoder_input_indexes_.resize(4);
			
 
				+
			
 
				+  // output indexes map
			
 
				+  // [0] -> out0, encoder_out
			
 
				+  // [1] -> out2, hx
			
 
				+  // [2] -> out3, cx
			
 
				+  encoder_output_indexes_.resize(3);
			
 
				+  const auto &blobs = encoder_.blobs();
			
 
				+  for (int32_t i = 0; i != blobs.size(); ++i) {
			
 
				+    const auto &b = blobs[i];
			
 
				+    if (b.name == "in0") encoder_input_indexes_[0] = i;
			
 
				+    if (b.name == "in1") encoder_input_indexes_[1] = i;
			
 
				+    if (b.name == "in2") encoder_input_indexes_[2] = i;
			
 
				+    if (b.name == "in3") encoder_input_indexes_[3] = i;
			
 
				+    if (b.name == "out0") encoder_output_indexes_[0] = i;
			
 
				+    if (b.name == "out2") encoder_output_indexes_[1] = i;
			
 
				+    if (b.name == "out3") encoder_output_indexes_[2] = i;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void LstmModel::InitDecoderInputOutputIndexes() {
			
 
				+  // input indexes map
			
 
				+  // [0] -> in0, decoder_input,
			
 
				+  decoder_input_indexes_.resize(1);
			
 
				+
			
 
				+  // output indexes map
			
 
				+  // [0] -> out0, decoder_out,
			
 
				+  decoder_output_indexes_.resize(1);
			
 
				+
			
 
				+  const auto &blobs = decoder_.blobs();
			
 
				+  for (int32_t i = 0; i != blobs.size(); ++i) {
			
 
				+    const auto &b = blobs[i];
			
 
				+    if (b.name == "in0") decoder_input_indexes_[0] = i;
			
 
				+    if (b.name == "out0") decoder_output_indexes_[0] = i;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void LstmModel::InitJoinerInputOutputIndexes() {
			
 
				+  // input indexes map
			
 
				+  // [0] -> in0, encoder_input,
			
 
				+  // [1] -> in1, decoder_input,
			
 
				+  joiner_input_indexes_.resize(2);
			
 
				+
			
 
				+  // output indexes map
			
 
				+  // [0] -> out0, joiner_out,
			
 
				+  joiner_output_indexes_.resize(1);
			
 
				+
			
 
				+  const auto &blobs = joiner_.blobs();
			
 
				+  for (int32_t i = 0; i != blobs.size(); ++i) {
			
 
				+    const auto &b = blobs[i];
			
 
				+    if (b.name == "in0") joiner_input_indexes_[0] = i;
			
 
				+    if (b.name == "in1") joiner_input_indexes_[1] = i;
			
 
				+    if (b.name == "out0") joiner_output_indexes_[0] = i;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				 }  // namespace sherpa_ncnn
			
--- a/sherpa-ncnn/csrc/lstm-model.h
+++ b/sherpa-ncnn/csrc/lstm-model.h
@@ -73,10 +73,24 @@ class LstmModel : public Model {
 
				   void InitJoiner(const std::string &joiner_param,
			
 
				                   const std::string &joiner_bin);
			
 
				 
			
 
				+  std::vector<ncnn::Mat> GetEncoderInitStates() const;
			
 
				+
			
 
				+  void InitEncoderInputOutputIndexes();
			
 
				+  void InitDecoderInputOutputIndexes();
			
 
				+  void InitJoinerInputOutputIndexes();
			
 
				+
			
 
				  private:
			
 
				   ncnn::Net encoder_;
			
 
				   ncnn::Net decoder_;
			
 
				   ncnn::Net joiner_;
			
 
				+  std::vector<int32_t> encoder_input_indexes_;
			
 
				+  std::vector<int32_t> encoder_output_indexes_;
			
 
				+
			
 
				+  std::vector<int32_t> decoder_input_indexes_;
			
 
				+  std::vector<int32_t> decoder_output_indexes_;
			
 
				+
			
 
				+  std::vector<int32_t> joiner_input_indexes_;
			
 
				+  std::vector<int32_t> joiner_output_indexes_;
			
 
				 
			
 
				   int32_t num_threads_;
			
 
				 };
			
--- a/sherpa-ncnn/csrc/model.cc
+++ b/sherpa-ncnn/csrc/model.cc
@@ -19,6 +19,7 @@
 
				 
			
 
				 #include <sstream>
			
 
				 
			
 
				+#include "sherpa-ncnn/csrc/conv-emformer-model.h"
			
 
				 #include "sherpa-ncnn/csrc/lstm-model.h"
			
 
				 
			
 
				 namespace sherpa_ncnn {
			
@@ -41,7 +42,7 @@ std::string ModelConfig::ToString() const {
 
				 
			
 
				 static bool IsLstmModel(const ncnn::Net &net) {
			
 
				   for (const auto &layer : net.layers()) {
			
 
				-    if (layer->type == "LSTM" || layer->type == "LSTM2") {
			
 
				+    if (layer->type == "LSTM") {
			
 
				       return true;
			
 
				     }
			
 
				   }
			
@@ -49,6 +50,38 @@ static bool IsLstmModel(const ncnn::Net &net) {
 
				   return false;
			
 
				 }
			
 
				 
			
 
				+static bool IsConvEmformerModel(const ncnn::Net &net) {
			
 
				+  // Note: We may need to add more constraints if number of models gets larger.
			
 
				+  if (net.input_indexes().size() < 49) {
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  if (net.output_indexes().size() < 49) {
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  for (const auto &layer : net.layers()) {
			
 
				+    if (layer->type == "GLU") {
			
 
				+      return true;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  return false;
			
 
				+}
			
 
				+
			
 
				+void Model::InitNet(ncnn::Net &net, const std::string &param,
			
 
				+                    const std::string &bin) {
			
 
				+  if (net.load_param(param.c_str())) {
			
 
				+    NCNN_LOGE("failed to load %s", param.c_str());
			
 
				+    exit(-1);
			
 
				+  }
			
 
				+
			
 
				+  if (net.load_model(bin.c_str())) {
			
 
				+    NCNN_LOGE("failed to load %s", bin.c_str());
			
 
				+    exit(-1);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				 std::unique_ptr<Model> Model::Create(const ModelConfig &config) {
			
 
				   // 1. Load the encoder network
			
 
				   // 2. If the encoder network has LSTM layers, we assume it is a LstmModel
			
@@ -67,6 +100,10 @@ std::unique_ptr<Model> Model::Create(const ModelConfig &config) {
 
				     return std::make_unique<LstmModel>(config);
			
 
				   }
			
 
				 
			
 
				+  if (IsConvEmformerModel(net)) {
			
 
				+    return std::make_unique<ConvEmformerModel>(config);
			
 
				+  }
			
 
				+
			
 
				   return nullptr;
			
 
				 }
			
 
				 
			
--- a/sherpa-ncnn/csrc/model.h
+++ b/sherpa-ncnn/csrc/model.h
@@ -39,10 +39,13 @@ struct ModelConfig {
 
				 
			
 
				 class Model {
			
 
				  public:
			
 
				+  virtual ~Model() = default;
			
 
				+
			
 
				   /** Create a model from a config. */
			
 
				   static std::unique_ptr<Model> Create(const ModelConfig &config);
			
 
				 
			
 
				-  virtual ~Model() = default;
			
 
				+  static void InitNet(ncnn::Net &net, const std::string &param,
			
 
				+                      const std::string &bin);
			
 
				 
			
 
				   /** Run the encoder network.
			
 
				    *