2 年之前 · 6f355b4211
--- a/sherpa-ncnn/csrc/CMakeLists.txt
+++ b/sherpa-ncnn/csrc/CMakeLists.txt
@@ -4,6 +4,7 @@ set(sherpa_ncnn_core_srcs
 
				   decode.cc
			
 
				   features.cc
			
 
				   lstm-model.cc
			
 
				+  model.cc
			
 
				   symbol-table.cc
			
 
				   wave-reader.cc
			
 
				 )
			
--- a/sherpa-ncnn/csrc/decode.cc
+++ b/sherpa-ncnn/csrc/decode.cc
@@ -20,15 +20,15 @@
 
				 
			
 
				 namespace sherpa_ncnn {
			
 
				 
			
 
				-void GreedySearch(LstmModel &model, ncnn::Mat &encoder_out,
			
 
				-                  ncnn::Mat *decoder_out, std::vector<int32_t> *hyp) {
			
 
				+void GreedySearch(Model *model, ncnn::Mat &encoder_out, ncnn::Mat *decoder_out,
			
 
				+                  std::vector<int32_t> *hyp) {
			
 
				   int32_t context_size = 2;
			
 
				   int32_t blank_id = 0;  // hard-code it to 0
			
 
				   ncnn::Mat decoder_input(context_size);
			
 
				 
			
 
				   for (int32_t t = 0; t != encoder_out.h; ++t) {
			
 
				     ncnn::Mat encoder_out_t(encoder_out.w, encoder_out.row(t));
			
 
				-    ncnn::Mat joiner_out = model.RunJoiner(encoder_out_t, *decoder_out);
			
 
				+    ncnn::Mat joiner_out = model->RunJoiner(encoder_out_t, *decoder_out);
			
 
				 
			
 
				     auto y = static_cast<int32_t>(std::distance(
			
 
				         static_cast<const float *>(joiner_out),
			
@@ -41,7 +41,7 @@ void GreedySearch(LstmModel &model, ncnn::Mat &encoder_out,
 
				       static_cast<int32_t *>(decoder_input)[1] = y;
			
 
				       hyp->push_back(y);
			
 
				 
			
 
				-      *decoder_out = model.RunDecoder(decoder_input);
			
 
				+      *decoder_out = model->RunDecoder(decoder_input);
			
 
				     }
			
 
				   }
			
 
				 }
			
--- a/sherpa-ncnn/csrc/decode.h
+++ b/sherpa-ncnn/csrc/decode.h
@@ -22,13 +22,13 @@
 
				 #include <vector>
			
 
				 
			
 
				 #include "net.h"  // NOLINT
			
 
				-#include "sherpa-ncnn/csrc/lstm-model.h"
			
 
				+#include "sherpa-ncnn/csrc/model.h"
			
 
				 
			
 
				 namespace sherpa_ncnn {
			
 
				 
			
 
				 /**
			
 
				  *
			
 
				- * @param model  The LstmModel
			
 
				+ * @param model  The neural network.
			
 
				  * @param encoder_out  Its shape is (num_frames, encoder_out_dim).
			
 
				  *                     encoder_out.w == encoder_out_dim
			
 
				  *                     encoder_out.h == num_frames
			
@@ -37,8 +37,8 @@ namespace sherpa_ncnn {
 
				  *                     decoder_out.h == 1
			
 
				  * @param hyp The recognition result. It is changed in place.
			
 
				  */
			
 
				-void GreedySearch(LstmModel &model, ncnn::Mat &encoder_out,
			
 
				-                  ncnn::Mat *decoder_out, std::vector<int32_t> *hyp);
			
 
				+void GreedySearch(Model *model, ncnn::Mat &encoder_out, ncnn::Mat *decoder_out,
			
 
				+                  std::vector<int32_t> *hyp);
			
 
				 
			
 
				 }  // namespace sherpa_ncnn
			
 
				 
			
--- a/sherpa-ncnn/csrc/lstm-model.cc
+++ b/sherpa-ncnn/csrc/lstm-model.cc
@@ -18,6 +18,8 @@
 
				 #include "sherpa-ncnn/csrc/lstm-model.h"
			
 
				 
			
 
				 #include <iostream>
			
 
				+#include <utility>
			
 
				+#include <vector>
			
 
				 
			
 
				 namespace sherpa_ncnn {
			
 
				 
			
@@ -34,30 +36,30 @@ static void InitNet(ncnn::Net &net, const std::string &param,
 
				   }
			
 
				 }
			
 
				 
			
 
				-LstmModel::LstmModel(const std::string &encoder_param,
			
 
				-                     const std::string &encoder_bin,
			
 
				-                     const std::string &decoder_param,
			
 
				-                     const std::string &decoder_bin,
			
 
				-                     const std::string &joiner_param,
			
 
				-                     const std::string &joiner_bin, int32_t num_threads)
			
 
				-    : num_threads_(num_threads) {
			
 
				-  InitEncoder(encoder_param, encoder_bin);
			
 
				-  InitDecoder(decoder_param, decoder_bin);
			
 
				-  InitJoiner(joiner_param, joiner_bin);
			
 
				+LstmModel::LstmModel(const ModelConfig &config)
			
 
				+    : num_threads_(config.num_threads) {
			
 
				+  InitEncoder(config.encoder_param, config.encoder_bin);
			
 
				+  InitDecoder(config.decoder_param, config.decoder_bin);
			
 
				+  InitJoiner(config.joiner_param, config.joiner_bin);
			
 
				 }
			
 
				 
			
 
				-ncnn::Mat LstmModel::RunEncoder(ncnn::Mat &features, ncnn::Mat *hx,
			
 
				-                                ncnn::Mat *cx) {
			
 
				+std::pair<ncnn::Mat, std::vector<ncnn::Mat>> LstmModel::RunEncoder(
			
 
				+    ncnn::Mat &features, const std::vector<ncnn::Mat> &states) {
			
 
				   int32_t num_encoder_layers = 12;
			
 
				   int32_t d_model = 512;
			
 
				   int32_t rnn_hidden_size = 1024;
			
 
				-
			
 
				-  if (hx->empty()) {
			
 
				-    hx->create(d_model, num_encoder_layers);
			
 
				-    cx->create(rnn_hidden_size, num_encoder_layers);
			
 
				-
			
 
				-    hx->fill(0);
			
 
				-    cx->fill(0);
			
 
				+  ncnn::Mat hx;
			
 
				+  ncnn::Mat cx;
			
 
				+
			
 
				+  if (states.empty()) {
			
 
				+    hx.create(d_model, num_encoder_layers);
			
 
				+    cx.create(rnn_hidden_size, num_encoder_layers);
			
 
				+
			
 
				+    hx.fill(0);
			
 
				+    cx.fill(0);
			
 
				+  } else {
			
 
				+    hx = states[0];
			
 
				+    cx = states[1];
			
 
				   }
			
 
				 
			
 
				   ncnn::Mat feature_lengths(1);
			
@@ -68,16 +70,18 @@ ncnn::Mat LstmModel::RunEncoder(ncnn::Mat &features, ncnn::Mat *hx,
 
				 
			
 
				   encoder_ex.input("in0", features);
			
 
				   encoder_ex.input("in1", feature_lengths);
			
 
				-  encoder_ex.input("in2", *hx);
			
 
				-  encoder_ex.input("in3", *cx);
			
 
				+  encoder_ex.input("in2", hx);
			
 
				+  encoder_ex.input("in3", cx);
			
 
				 
			
 
				   ncnn::Mat encoder_out;
			
 
				   encoder_ex.extract("out0", encoder_out);
			
 
				 
			
 
				-  encoder_ex.extract("out2", *hx);
			
 
				-  encoder_ex.extract("out3", *cx);
			
 
				+  encoder_ex.extract("out2", hx);
			
 
				+  encoder_ex.extract("out3", cx);
			
 
				+
			
 
				+  std::vector<ncnn::Mat> next_states = {hx, cx};
			
 
				 
			
 
				-  return encoder_out;
			
 
				+  return {encoder_out, next_states};
			
 
				 }
			
 
				 
			
 
				 ncnn::Mat LstmModel::RunDecoder(ncnn::Mat &decoder_input) {
			
--- a/sherpa-ncnn/csrc/lstm-model.h
+++ b/sherpa-ncnn/csrc/lstm-model.h
@@ -20,65 +20,50 @@
 
				 #define SHERPA_NCNN_CSRC_LSTM_MODEL_H_
			
 
				 
			
 
				 #include <string>
			
 
				+#include <utility>
			
 
				+#include <vector>
			
 
				 
			
 
				 #include "net.h"  // NOLINT
			
 
				+#include "sherpa-ncnn/csrc/model.h"
			
 
				 
			
 
				 namespace sherpa_ncnn {
			
 
				 
			
 
				-class LstmModel {
			
 
				+class LstmModel : public Model {
			
 
				  public:
			
 
				-  /**
			
 
				-   * @param encoder_param Path to encoder.ncnn.param
			
 
				-   * @param encoder_bin Path to encoder.ncnn.bin
			
 
				-   * @param decoder_param Path to decoder.ncnn.param
			
 
				-   * @param decoder_bin Path to decoder.ncnn.bin
			
 
				-   * @param joiner_param Path to joiner.ncnn.param
			
 
				-   * @param joiner_bin Path to joiner.ncnn.bin
			
 
				-   * @param num_threads Number of threads to use when running the network
			
 
				-   */
			
 
				-  LstmModel(const std::string &encoder_param, const std::string &encoder_bin,
			
 
				-            const std::string &decoder_param, const std::string &decoder_bin,
			
 
				-            const std::string &joiner_param, const std::string &joiner_bin,
			
 
				-            int32_t num_threads);
			
 
				+  explicit LstmModel(const ModelConfig &config);
			
 
				 
			
 
				   /** Run the encoder network.
			
 
				    *
			
 
				    * @param features  A 2-d mat of shape (num_frames, feature_dim).
			
 
				    *                  Note: features.w = feature_dim.
			
 
				    *                        features.h = num_frames.
			
 
				-   * @param hx  Hidden state of the LSTM model. You can leave it to empty
			
 
				-   *            on the first invocation. It is changed in-place.
			
 
				+   * @param states Contains two tensors:
			
 
				+   *          - hx  Hidden state of the LSTM model. You can leave it to empty
			
 
				+   *                on the first invocation. It is changed in-place.
			
 
				    *
			
 
				-   * @param cx  Hidden cell state of the LSTM model. You can leave it to empty
			
 
				-   *            on the first invocation. It is changed in-place.
			
 
				+   *          - cx  Hidden cell state of the LSTM model. You can leave it to
			
 
				+   *                empty on the first invocation. It is changed in-place.
			
 
				    *
			
 
				-   * @return Return the output of the encoder. Its shape is
			
 
				-   *  (num_out_frames, encoder_dim).
			
 
				-   *  Note: ans.w == encoder_dim; ans.h == num_out_frames
			
 
				-   */
			
 
				-  ncnn::Mat RunEncoder(ncnn::Mat &features, ncnn::Mat *hx, ncnn::Mat *cx);
			
 
				-
			
 
				-  /** Run the decoder network.
			
 
				+   *          - Note: on the first invocation, you can pass an empty vector.
			
 
				    *
			
 
				-   * @param  decoder_input A mat of shape (context_size,). Note: Its underlying
			
 
				-   *                       content consists of integers, though its type is
			
 
				-   *                       float.
			
 
				+   * @return Return a pair containing:
			
 
				+   *   - the output of the encoder. Its shape is (num_out_frames, encoder_dim).
			
 
				+   *     Note: ans.w == encoder_dim; ans.h == num_out_frames
			
 
				    *
			
 
				-   * @return Return a mat of shape (decoder_dim,)
			
 
				+   *   - next_states, a vector containing hx and cx for the next invocation
			
 
				    */
			
 
				-  ncnn::Mat RunDecoder(ncnn::Mat &decoder_input);
			
 
				+  std::pair<ncnn::Mat, std::vector<ncnn::Mat>> RunEncoder(
			
 
				+      ncnn::Mat &features, const std::vector<ncnn::Mat> &states) override;
			
 
				 
			
 
				-  /** Run the joiner network.
			
 
				-   *
			
 
				-   * @param encoder_out  A mat of shape (encoder_dim,)
			
 
				-   * @param decoder_out  A mat of shape (decoder_dim,)
			
 
				-   *
			
 
				-   * @return Return the joiner output which is of shape (vocab_size,)
			
 
				-   */
			
 
				-  ncnn::Mat RunJoiner(ncnn::Mat &encoder_out, ncnn::Mat &decoder_out);
			
 
				+  ncnn::Mat RunDecoder(ncnn::Mat &decoder_input) override;
			
 
				+
			
 
				+  ncnn::Mat RunJoiner(ncnn::Mat &encoder_out, ncnn::Mat &decoder_out) override;
			
 
				+
			
 
				+  int32_t Segment() const override { return 9; }
			
 
				 
			
 
				-  int32_t ContextSize() const { return 2; }
			
 
				-  int32_t BlankId() const { return 0; }
			
 
				+  // Advance the feature extract by this number of frames after
			
 
				+  // running the encoder network
			
 
				+  int32_t Offset() const override { return 4; }
			
 
				 
			
 
				  private:
			
 
				   void InitEncoder(const std::string &encoder_param,
			
--- a/sherpa-ncnn/csrc/model.cc
+++ b/sherpa-ncnn/csrc/model.cc
@@ -0,0 +1,73 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+#include "sherpa-ncnn/csrc/model.h"
			
 
				+
			
 
				+#include <sstream>
			
 
				+
			
 
				+#include "sherpa-ncnn/csrc/lstm-model.h"
			
 
				+
			
 
				+namespace sherpa_ncnn {
			
 
				+
			
 
				+std::string ModelConfig::ToString() const {
			
 
				+  std::ostringstream os;
			
 
				+  os << "encoder_param: " << encoder_param << "\n";
			
 
				+  os << "encoder_bin: " << encoder_bin << "\n";
			
 
				+
			
 
				+  os << "decoder_param: " << decoder_param << "\n";
			
 
				+  os << "decoder_bin: " << decoder_bin << "\n";
			
 
				+
			
 
				+  os << "joiner_param: " << joiner_param << "\n";
			
 
				+  os << "joiner_bin: " << joiner_bin << "\n";
			
 
				+
			
 
				+  os << "num_threads: " << num_threads << "\n";
			
 
				+
			
 
				+  return os.str();
			
 
				+}
			
 
				+
			
 
				+static bool IsLstmModel(const ncnn::Net &net) {
			
 
				+  for (const auto &layer : net.layers()) {
			
 
				+    if (layer->type == "LSTM" || layer->type == "LSTM2") {
			
 
				+      return true;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  return false;
			
 
				+}
			
 
				+
			
 
				+std::unique_ptr<Model> Model::Create(const ModelConfig &config) {
			
 
				+  // 1. Load the encoder network
			
 
				+  // 2. If the encoder network has LSTM layers, we assume it is a LstmModel
			
 
				+  // 3. Otherwise, we assume it is a ConvEmformer
			
 
				+  // 4. TODO(fangjun): We need to change this function to support more models
			
 
				+  // in the future
			
 
				+
			
 
				+  ncnn::Net net;
			
 
				+  auto ret = net.load_param(config.encoder_param.c_str());
			
 
				+  if (ret != 0) {
			
 
				+    NCNN_LOGE("Failed to load %s", config.encoder_param.c_str());
			
 
				+    return nullptr;
			
 
				+  }
			
 
				+
			
 
				+  if (IsLstmModel(net)) {
			
 
				+    return std::make_unique<LstmModel>(config);
			
 
				+  }
			
 
				+
			
 
				+  return nullptr;
			
 
				+}
			
 
				+
			
 
				+}  // namespace sherpa_ncnn
			
--- a/sherpa-ncnn/csrc/model.h
+++ b/sherpa-ncnn/csrc/model.h
@@ -0,0 +1,96 @@
 
				+/**
			
 
				+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
			
 
				+ *
			
 
				+ * See LICENSE for clarification regarding multiple authors
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef SHERPA_NCNN_CSRC_MODEL_H_
			
 
				+#define SHERPA_NCNN_CSRC_MODEL_H_
			
 
				+
			
 
				+#include <memory>
			
 
				+#include <string>
			
 
				+
			
 
				+#include "net.h"  // NOLINT
			
 
				+
			
 
				+namespace sherpa_ncnn {
			
 
				+
			
 
				+struct ModelConfig {
			
 
				+  std::string encoder_param;  // path to encoder.ncnn.param
			
 
				+  std::string encoder_bin;    // path to encoder.ncnn.bin
			
 
				+  std::string decoder_param;  // path to decoder.ncnn.param
			
 
				+  std::string decoder_bin;    // path to decoder.ncnn.bin
			
 
				+  std::string joiner_param;   // path to joiner.ncnn.param
			
 
				+  std::string joiner_bin;     // path to joiner.ncnn.bin
			
 
				+  int32_t num_threads;        // number of threads to run the model
			
 
				+  std::string ToString() const;
			
 
				+};
			
 
				+
			
 
				+class Model {
			
 
				+ public:
			
 
				+  /** Create a model from a config. */
			
 
				+  static std::unique_ptr<Model> Create(const ModelConfig &config);
			
 
				+
			
 
				+  virtual ~Model() = default;
			
 
				+
			
 
				+  /** Run the encoder network.
			
 
				+   *
			
 
				+   * @param features  A 2-d mat of shape (num_frames, feature_dim).
			
 
				+   *                  Note: features.w = feature_dim.
			
 
				+   *                        features.h = num_frames.
			
 
				+   * @param states It contains the states for the encoder network. Its exact
			
 
				+   *               content is determined by the underlying network.
			
 
				+   *
			
 
				+   * @return Return a pair containing:
			
 
				+   *   - encoder_out
			
 
				+   *   - next_states
			
 
				+   */
			
 
				+  virtual std::pair<ncnn::Mat, std::vector<ncnn::Mat>> RunEncoder(
			
 
				+      ncnn::Mat &features, const std::vector<ncnn::Mat> &states) = 0;
			
 
				+
			
 
				+  /** Run the decoder network.
			
 
				+   *
			
 
				+   * @param  decoder_input A mat of shape (context_size,). Note: Its underlying
			
 
				+   *                       content consists of integers, though its type is
			
 
				+   *                       float.
			
 
				+   *
			
 
				+   * @return Return a mat of shape (decoder_dim,)
			
 
				+   */
			
 
				+  virtual ncnn::Mat RunDecoder(ncnn::Mat &decoder_input) = 0;
			
 
				+
			
 
				+  /** Run the joiner network.
			
 
				+   *
			
 
				+   * @param encoder_out  A mat of shape (encoder_dim,)
			
 
				+   * @param decoder_out  A mat of shape (decoder_dim,)
			
 
				+   *
			
 
				+   * @return Return the joiner output which is of shape (vocab_size,)
			
 
				+   */
			
 
				+  virtual ncnn::Mat RunJoiner(ncnn::Mat &encoder_out,
			
 
				+                              ncnn::Mat &decoder_out) = 0;
			
 
				+
			
 
				+  virtual int32_t ContextSize() const { return 2; }
			
 
				+
			
 
				+  virtual int32_t BlankId() const { return 0; }
			
 
				+
			
 
				+  // The encoder takes this number of frames as input
			
 
				+  virtual int32_t Segment() const = 0;
			
 
				+
			
 
				+  // Advance the feature extractor by this number of frames after
			
 
				+  // running the encoder network
			
 
				+  virtual int32_t Offset() const = 0;
			
 
				+};
			
 
				+
			
 
				+}  // namespace sherpa_ncnn
			
 
				+
			
 
				+#endif  // SHERPA_NCNN_CSRC_MODEL_H_
			
--- a/sherpa-ncnn/csrc/sherpa-ncnn-microphone.cc
+++ b/sherpa-ncnn/csrc/sherpa-ncnn-microphone.cc
@@ -23,8 +23,8 @@
 
				 #include "portaudio.h"  // NOLINT
			
 
				 #include "sherpa-ncnn/csrc/decode.h"
			
 
				 #include "sherpa-ncnn/csrc/features.h"
			
 
				-#include "sherpa-ncnn/csrc/lstm-model.h"
			
 
				 #include "sherpa-ncnn/csrc/microphone.h"
			
 
				+#include "sherpa-ncnn/csrc/model.h"
			
 
				 #include "sherpa-ncnn/csrc/symbol-table.h"
			
 
				 
			
 
				 bool stop = false;
			
@@ -71,25 +71,29 @@ https://huggingface.co/csukuangfj/sherpa-ncnn-2022-09-05
 
				   }
			
 
				   signal(SIGINT, Handler);
			
 
				 
			
 
				+  sherpa_ncnn::ModelConfig config;
			
 
				+
			
 
				   std::string tokens = argv[1];
			
 
				-  std::string encoder_param = argv[2];
			
 
				-  std::string encoder_bin = argv[3];
			
 
				-  std::string decoder_param = argv[4];
			
 
				-  std::string decoder_bin = argv[5];
			
 
				-  std::string joiner_param = argv[6];
			
 
				-  std::string joiner_bin = argv[7];
			
 
				-
			
 
				-  int32_t num_threads = 4;
			
 
				+  config.encoder_param = argv[2];
			
 
				+  config.encoder_bin = argv[3];
			
 
				+  config.decoder_param = argv[4];
			
 
				+  config.decoder_bin = argv[5];
			
 
				+  config.joiner_param = argv[6];
			
 
				+  config.joiner_bin = argv[7];
			
 
				+
			
 
				+  config.num_threads = 4;
			
 
				   if (argc == 9) {
			
 
				-    num_threads = atoi(argv[8]);
			
 
				+    config.num_threads = atoi(argv[8]);
			
 
				   }
			
 
				 
			
 
				   sherpa_ncnn::SymbolTable sym(tokens);
			
 
				-  fprintf(stderr, "Number of threads: %d\n", num_threads);
			
 
				+  fprintf(stderr, "%s\n", config.ToString().c_str());
			
 
				 
			
 
				-  sherpa_ncnn::LstmModel model(encoder_param, encoder_bin, decoder_param,
			
 
				-                               decoder_bin, joiner_param, joiner_bin,
			
 
				-                               num_threads);
			
 
				+  auto model = sherpa_ncnn::Model::Create(config);
			
 
				+  if (!model) {
			
 
				+    fprintf(stderr, "Failed to create a model\n");
			
 
				+    exit(EXIT_FAILURE);
			
 
				+  }
			
 
				 
			
 
				   sherpa_ncnn::Microphone mic;
			
 
				 
			
@@ -139,11 +143,11 @@ https://huggingface.co/csukuangfj/sherpa-ncnn-2022-09-05
 
				     exit(EXIT_FAILURE);
			
 
				   }
			
 
				 
			
 
				-  int32_t segment = 9;
			
 
				-  int32_t offset = 4;
			
 
				+  int32_t segment = model->Segment();
			
 
				+  int32_t offset = model->Offset();
			
 
				 
			
 
				-  int32_t context_size = model.ContextSize();
			
 
				-  int32_t blank_id = model.BlankId();
			
 
				+  int32_t context_size = model->ContextSize();
			
 
				+  int32_t blank_id = model->BlankId();
			
 
				 
			
 
				   std::vector<int32_t> hyp(context_size, blank_id);
			
 
				 
			
@@ -152,7 +156,7 @@ https://huggingface.co/csukuangfj/sherpa-ncnn-2022-09-05
 
				     static_cast<int32_t *>(decoder_input)[i] = blank_id;
			
 
				   }
			
 
				 
			
 
				-  ncnn::Mat decoder_out = model.RunDecoder(decoder_input);
			
 
				+  ncnn::Mat decoder_out = model->RunDecoder(decoder_input);
			
 
				 
			
 
				   ncnn::Mat hx;
			
 
				   ncnn::Mat cx;
			
@@ -160,14 +164,17 @@ https://huggingface.co/csukuangfj/sherpa-ncnn-2022-09-05
 
				   int32_t num_tokens = hyp.size();
			
 
				   int32_t num_processed = 0;
			
 
				 
			
 
				+  std::vector<ncnn::Mat> states;
			
 
				+  ncnn::Mat encoder_out;
			
 
				+
			
 
				   while (!stop) {
			
 
				     while (feature_extractor.NumFramesReady() - num_processed >= segment) {
			
 
				       ncnn::Mat features = feature_extractor.GetFrames(num_processed, segment);
			
 
				       num_processed += offset;
			
 
				 
			
 
				-      ncnn::Mat encoder_out = model.RunEncoder(features, &hx, &cx);
			
 
				+      std::tie(encoder_out, states) = model->RunEncoder(features, states);
			
 
				 
			
 
				-      GreedySearch(model, encoder_out, &decoder_out, &hyp);
			
 
				+      GreedySearch(model.get(), encoder_out, &decoder_out, &hyp);
			
 
				     }
			
 
				 
			
 
				     if (hyp.size() != num_tokens) {
			
--- a/sherpa-ncnn/csrc/sherpa-ncnn.cc
+++ b/sherpa-ncnn/csrc/sherpa-ncnn.cc
@@ -23,7 +23,7 @@
 
				 #include "net.h"  // NOLINT
			
 
				 #include "sherpa-ncnn/csrc/decode.h"
			
 
				 #include "sherpa-ncnn/csrc/features.h"
			
 
				-#include "sherpa-ncnn/csrc/lstm-model.h"
			
 
				+#include "sherpa-ncnn/csrc/model.h"
			
 
				 #include "sherpa-ncnn/csrc/symbol-table.h"
			
 
				 #include "sherpa-ncnn/csrc/wave-reader.h"
			
 
				 
			
@@ -89,29 +89,35 @@ https://huggingface.co/csukuangfj/sherpa-ncnn-2022-09-05
 
				 
			
 
				     return 0;
			
 
				   }
			
 
				+  sherpa_ncnn::ModelConfig config;
			
 
				+
			
 
				   std::string tokens = argv[1];
			
 
				-  std::string encoder_param = argv[2];
			
 
				-  std::string encoder_bin = argv[3];
			
 
				-  std::string decoder_param = argv[4];
			
 
				-  std::string decoder_bin = argv[5];
			
 
				-  std::string joiner_param = argv[6];
			
 
				-  std::string joiner_bin = argv[7];
			
 
				+
			
 
				+  config.encoder_param = argv[2];
			
 
				+  config.encoder_bin = argv[3];
			
 
				+  config.decoder_param = argv[4];
			
 
				+  config.decoder_bin = argv[5];
			
 
				+  config.joiner_param = argv[6];
			
 
				+  config.joiner_bin = argv[7];
			
 
				+
			
 
				   std::string wav_filename = argv[8];
			
 
				 
			
 
				-  int32_t num_threads = 4;
			
 
				+  config.num_threads = 4;
			
 
				   if (argc == 10) {
			
 
				-    num_threads = atoi(argv[9]);
			
 
				+    config.num_threads = atoi(argv[9]);
			
 
				   }
			
 
				 
			
 
				   float expected_sampling_rate = 16000;
			
 
				 
			
 
				   sherpa_ncnn::SymbolTable sym(tokens);
			
 
				 
			
 
				-  std::cout << "number of threads: " << num_threads << "\n";
			
 
				+  std::cout << config.ToString() << "\n";
			
 
				 
			
 
				-  sherpa_ncnn::LstmModel model(encoder_param, encoder_bin, decoder_param,
			
 
				-                               decoder_bin, joiner_param, joiner_bin,
			
 
				-                               num_threads);
			
 
				+  auto model = sherpa_ncnn::Model::Create(config);
			
 
				+  if (!model) {
			
 
				+    std::cout << "Failed to create a model\n";
			
 
				+    exit(EXIT_FAILURE);
			
 
				+  }
			
 
				 
			
 
				   std::vector<float> samples =
			
 
				       sherpa_ncnn::ReadWave(wav_filename, expected_sampling_rate);
			
@@ -132,11 +138,11 @@ https://huggingface.co/csukuangfj/sherpa-ncnn-2022-09-05
 
				 
			
 
				   feature_extractor.InputFinished();
			
 
				 
			
 
				-  int32_t segment = 9;
			
 
				-  int32_t offset = 4;
			
 
				+  int32_t segment = model->Segment();
			
 
				+  int32_t offset = model->Offset();
			
 
				 
			
 
				-  int32_t context_size = model.ContextSize();
			
 
				-  int32_t blank_id = model.BlankId();
			
 
				+  int32_t context_size = model->ContextSize();
			
 
				+  int32_t blank_id = model->BlankId();
			
 
				 
			
 
				   std::vector<int32_t> hyp(context_size, blank_id);
			
 
				 
			
@@ -145,19 +151,19 @@ https://huggingface.co/csukuangfj/sherpa-ncnn-2022-09-05
 
				     static_cast<int32_t *>(decoder_input)[i] = blank_id;
			
 
				   }
			
 
				 
			
 
				-  ncnn::Mat decoder_out = model.RunDecoder(decoder_input);
			
 
				+  ncnn::Mat decoder_out = model->RunDecoder(decoder_input);
			
 
				 
			
 
				-  ncnn::Mat hx;
			
 
				-  ncnn::Mat cx;
			
 
				+  std::vector<ncnn::Mat> states;
			
 
				+  ncnn::Mat encoder_out;
			
 
				 
			
 
				   int32_t num_processed = 0;
			
 
				   while (feature_extractor.NumFramesReady() - num_processed >= segment) {
			
 
				     ncnn::Mat features = feature_extractor.GetFrames(num_processed, segment);
			
 
				     num_processed += offset;
			
 
				 
			
 
				-    ncnn::Mat encoder_out = model.RunEncoder(features, &hx, &cx);
			
 
				+    std::tie(encoder_out, states) = model->RunEncoder(features, states);
			
 
				 
			
 
				-    GreedySearch(model, encoder_out, &decoder_out, &hyp);
			
 
				+    GreedySearch(model.get(), encoder_out, &decoder_out, &hyp);
			
 
				   }
			
 
				 
			
 
				   std::string text;