2 năm trước cách đây · 4a5b459941
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 
				 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
			
 
				 project(sherpa-ncnn)
			
 
				 
			
 
				-set(SHERPA_NCNN_VERSION "1.4.1")
			
 
				+set(SHERPA_NCNN_VERSION "1.4.2")
			
 
				 
			
 
				 # Disable warning about
			
 
				 #
			
--- a/sherpa-ncnn/csrc/hypothesis.cc
+++ b/sherpa-ncnn/csrc/hypothesis.cc
@@ -55,4 +55,26 @@ Hypothesis Hypotheses::GetMostProbable(bool length_norm) const {
 
				   }
			
 
				 }
			
 
				 
			
 
				+std::vector<Hypothesis> Hypotheses::GetTopK(int32_t k, bool length_norm) const {  // Returns up to k hyps, highest score first.
			
 
				+  k = std::max(k, 1);  // a request for fewer than one hyp is treated as k == 1
			
 
				+  k = std::min(k, Size());  // cannot return more hyps than are stored; k is 0 when empty
			
 
				+  // Work on the copy returned by Vec() so this const object is not reordered.
			
 
				+  std::vector<Hypothesis> all_hyps = Vec();
			
 
				+  // partial_sort orders only the first k elements; the rest stay unsorted.
			
 
				+  if (length_norm == false) {  // rank by raw log_prob, largest first
			
 
				+    std::partial_sort(
			
 
				+        all_hyps.begin(), all_hyps.begin() + k, all_hyps.end(),
			
 
				+        [](const auto &a, const auto &b) { return a.log_prob > b.log_prob; });
			
 
				+  } else {
			
 
				+    // length_norm is true: rank by log_prob divided by the number of entries in ys.
			
 
				+    std::partial_sort(all_hyps.begin(), all_hyps.begin() + k, all_hyps.end(),
			
 
				+                      [](const auto &a, const auto &b) {
			
 
				+                        return a.log_prob / a.ys.size() >
			
 
				+                               b.log_prob / b.ys.size();  // NOTE(review): assumes ys is never empty - confirm
			
 
				+                      });
			
 
				+  }
			
 
				+  // Copy the k leading (now sorted) entries into the result.
			
 
				+  return {all_hyps.begin(), all_hyps.begin() + k};
			
 
				+}
			
 
				+
			
 
				 }  // namespace sherpa_ncnn
			
--- a/sherpa-ncnn/csrc/hypothesis.h
+++ b/sherpa-ncnn/csrc/hypothesis.h
@@ -83,13 +83,14 @@ class Hypotheses {
 
				   void Add(Hypothesis hyp);
			
 
				 
			
 
				   // Get the hyp that has the largest log_prob.
			
 
				-  // If length_norm is true, hyp's log_prob are divided by
			
 
				+  // If length_norm is true, hyp's log_prob is divided by
			
 
				   // len(hyp.ys) before comparison.
			
 
				   Hypothesis GetMostProbable(bool length_norm) const;
			
 
				 
			
 
				-  // Remove the given hyp from this object.
			
 
				-  // It is *NOT* an error if hyp does not exist in this object.
			
 
				-  void Remove(const Hypothesis &hyp) { hyps_dict_.erase(hyp.Key()); }
			
 
				+  // Get the k hyps that have the largest log_prob.
			
 
				+  // If length_norm is true, hyp's log_prob is divided by
			
 
				+  // len(hyp.ys) before comparison.
			
 
				+  std::vector<Hypothesis> GetTopK(int32_t k, bool length_norm) const;
			
 
				 
			
 
				   int32_t Size() const { return hyps_dict_.size(); }
			
 
				 
			
@@ -101,13 +102,21 @@ class Hypotheses {
 
				     return os.str();
			
 
				   }
			
 
				 
			
 
				-  auto begin() { return hyps_dict_.begin(); }
			
 
				-  auto end() { return hyps_dict_.end(); }
			
 
				-
			
 
				   const auto begin() const { return hyps_dict_.begin(); }
			
 
				   const auto end() const { return hyps_dict_.end(); }
			
 
				 
			
 
				-  void clear() { hyps_dict_.clear(); }
			
 
				+  void Clear() { hyps_dict_.clear(); }
			
 
				+
			
 
				+ private:
			
 
				+  // Return a newly built vector holding a copy of every hyp stored in hyps_dict_.
			
 
				+  std::vector<Hypothesis> Vec() const {
			
 
				+    std::vector<Hypothesis> ans;
			
 
				+    ans.reserve(hyps_dict_.size());  // reserve room for every entry up front
			
 
				+    for (const auto &p : hyps_dict_) {  // p.second is the stored hyp; the key p.first is not needed
			
 
				+      ans.push_back(p.second);
			
 
				+    }
			
 
				+    return ans;  // element order follows the iteration order of hyps_dict_
			
 
				+  }
			
 
				 
			
 
				  private:
			
 
				   using Map = std ::unordered_map<std::string, Hypothesis>;
			
--- a/sherpa-ncnn/csrc/math.h
+++ b/sherpa-ncnn/csrc/math.h
@@ -85,7 +85,7 @@ struct LogAdd<float> {
 
				 };
			
 
				 
			
 
				 template <class T>
			
 
				-void log_softmax(T *input, int32_t input_len) {
			
 
				+void LogSoftmax(T *input, int32_t input_len) {
			
 
				   assert(input);
			
 
				 
			
 
				   T m = *std::max_element(input, input + input_len);
			
@@ -102,7 +102,7 @@ void log_softmax(T *input, int32_t input_len) {
 
				 }
			
 
				 
			
 
				 template <class T>
			
 
				-std::vector<int32_t> topk_index(const T *vec, int32_t size, int32_t topk) {
			
 
				+std::vector<int32_t> TopkIndex(const T *vec, int32_t size, int32_t topk) {
			
 
				   std::vector<int32_t> vec_index(size);
			
 
				   std::iota(vec_index.begin(), vec_index.end(), 0);
			
 
				 
			
--- a/sherpa-ncnn/csrc/modified-beam-search-decoder.cc
+++ b/sherpa-ncnn/csrc/modified-beam-search-decoder.cc
@@ -18,6 +18,7 @@
 
				  */
			
 
				 #include "sherpa-ncnn/csrc/modified-beam-search-decoder.h"
			
 
				 
			
 
				+#include <algorithm>
			
 
				 #include <string>
			
 
				 #include <utility>
			
 
				 
			
@@ -25,6 +26,74 @@
 
				 
			
 
				 namespace sherpa_ncnn {
			
 
				 
			
 
				+// @param in 1-D tensor of shape (encoder_dim,)
			
 
				+// @param n Number of times to repeat
			
 
				+// @return Return a 2-d tensor of shape (n, encoder_dim)
			
 
				+//
			
 
				+// TODO(fangjun): Remove this function
			
 
				+// once
			
 
				+// https://github.com/nihui/ncnn/tree/pnnx-ncnn-binary-broadcast
			
 
				+// gets merged
			
 
				+static ncnn::Mat RepeatEncoderOut(ncnn::Mat in, int32_t n) {
			
 
				+  int32_t w = in.w;  // number of values in the 1-D input
			
 
				+  ncnn::Mat out(w, n, sizeof(float));  // n rows of w floats each
			
 
				+  // Both Mats are accessed through raw float pointers to their data.
			
 
				+  const float *in_ptr = in;
			
 
				+  float *out_ptr = out;
			
 
				+  // Write the same w input values into each of the n output rows.
			
 
				+  for (int32_t i = 0; i != n; ++i) {
			
 
				+    std::copy(in_ptr, in_ptr + w, out_ptr);
			
 
				+    out_ptr += w;  // NOTE(review): assumes rows of out are contiguous (stride w) - confirm
			
 
				+  }
			
 
				+
			
 
				+  return out;
			
 
				+}
			
 
				+
			
 
				+// Compute log_softmax in-place.
			
 
				+//
			
 
				+// The log_softmax of each row is computed.
			
 
				+//
			
 
				+// @param in_out A 2-D tensor
			
 
				+static void LogSoftmax(ncnn::Mat *in_out) {
			
 
				+  int32_t h = in_out->h;  // number of rows
			
 
				+  int32_t w = in_out->w;  // number of elements per row
			
 
				+  for (int32_t y = 0; y != h; ++y) {
			
 
				+    float *p = in_out->row(y);  // start of row y
			
 
				+    LogSoftmax(p, w);  // pointer-and-length template overload, applied to the w values of this row
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// The decoder model contains an embedding layer, which only supports
			
 
				+// 1-D output.
			
 
				+// This is a wrapper to support 2-D decoder output.
			
 
				+//
			
 
				+// @param model_ The NN model.
			
 
				+// @param decoder_input A 2-D tensor of shape (num_active_paths, context_size)
			
 
				+// @return Return a 2-D tensor of shape (num_active_paths, decoder_dim)
			
 
				+//
			
 
				+// TODO(fangjun): Change Embed in ncnn to output 2-d tensors
			
 
				+static ncnn::Mat RunDecoder2D(Model *model_, ncnn::Mat decoder_input) {
			
 
				+  ncnn::Mat decoder_out;  // allocated on the first iteration, once the output width is known
			
 
				+  int32_t h = decoder_input.h;  // number of input rows; the decoder is run once per row
			
 
				+  // If h is 0 the loop body never runs and the default-constructed Mat is returned.
			
 
				+  for (int32_t y = 0; y != h; ++y) {
			
 
				+    ncnn::Mat decoder_input_t =
			
 
				+        ncnn::Mat(decoder_input.w, decoder_input.row(y));  // 1-D Mat over row y
			
 
				+    // NOTE(review): presumably this Mat refers to row y's memory without copying - confirm.
			
 
				+    ncnn::Mat tmp = model_->RunDecoder(decoder_input_t);
			
 
				+    // Size the output as h rows of tmp.w values, using the first result's width.
			
 
				+    if (y == 0) {
			
 
				+      decoder_out = ncnn::Mat(tmp.w, h);
			
 
				+    }
			
 
				+    // Copy this row's tmp.w output values into row y of the result.
			
 
				+    const float *ptr = tmp;
			
 
				+    float *out_ptr = decoder_out.row(y);
			
 
				+    std::copy(ptr, ptr + tmp.w, out_ptr);
			
 
				+  }
			
 
				+
			
 
				+  return decoder_out;
			
 
				+}
			
 
				+
			
 
				 void ModifiedBeamSearchDecoder::AcceptWaveform(const float sample_rate,
			
 
				                                                const float *input_buffer,
			
 
				                                                int32_t frames_per_buffer) {
			
@@ -32,11 +101,20 @@ void ModifiedBeamSearchDecoder::AcceptWaveform(const float sample_rate,
 
				                                     frames_per_buffer);
			
 
				 }
			
 
				 
			
 
				-void ModifiedBeamSearchDecoder::BuildDecoderInput(Hypothesis hyp) {
			
 
				-  for (int32_t i = 0; i != context_size_; ++i) {
			
 
				-    static_cast<int32_t *>(decoder_input_)[i] =
			
 
				-        *(hyp.ys.end() - context_size_ + i);
			
 
				+ncnn::Mat ModifiedBeamSearchDecoder::BuildDecoderInput(
			
 
				+    const std::vector<Hypothesis> &hyps) const {  // builds one row of context tokens per hyp
			
 
				+  int32_t num_hyps = static_cast<int32_t>(hyps.size());
			
 
				+  // One row per hyp, context_size_ entries per row, filled through an int32_t pointer.
			
 
				+  ncnn::Mat decoder_input(context_size_, num_hyps);
			
 
				+  auto p = static_cast<int32_t *>(decoder_input);  // NOTE(review): assumes 4-byte Mat elements - confirm
			
 
				+
			
 
				+  for (const auto &hyp : hyps) {
			
 
				+    const auto &ys = hyp.ys;
			
 
				+    std::copy(ys.end() - context_size_, ys.end(), p);  // last context_size_ tokens; assumes ys holds at least that many - TODO confirm
			
 
				+    p += context_size_;  // advance by one row's worth of entries
			
 
				   }
			
 
				+
			
 
				+  return decoder_input;
			
 
				 }
			
 
				 
			
 
				 void ModifiedBeamSearchDecoder::ResetResult() {
			
@@ -50,45 +128,52 @@ void ModifiedBeamSearchDecoder::ResetResult() {
 
				 void ModifiedBeamSearchDecoder::Decode() {
			
 
				   while (feature_extractor_.NumFramesReady() - num_processed_ >= segment_) {
			
 
				     ncnn::Mat features = feature_extractor_.GetFrames(num_processed_, segment_);
			
 
				-    std::tie(encoder_out_, encoder_state_) =
			
 
				+    ncnn::Mat encoder_out;
			
 
				+    std::tie(encoder_out, encoder_state_) =
			
 
				         model_->RunEncoder(features, encoder_state_);
			
 
				 
			
 
				     Hypotheses cur = std::move(result_.hyps);
			
 
				-    /* encoder_out_.w == encoder_out_dim, encoder_out_.h == num_frames. */
			
 
				-    for (int32_t t = 0; t != encoder_out_.h; ++t) {
			
 
				-      std::vector<Hypothesis> prev;
			
 
				-      for (int32_t i = 0; i != config_.num_active_paths && cur.Size(); ++i) {
			
 
				-        auto cur_best_hyp = cur.GetMostProbable(true);
			
 
				-        cur.Remove(cur_best_hyp);
			
 
				-        prev.push_back(std::move(cur_best_hyp));
			
 
				-      }
			
 
				-      cur.clear();
			
 
				-
			
 
				-      for (const auto &h : prev) {
			
 
				-        ncnn::Mat encoder_out_t(encoder_out_.w, encoder_out_.row(t));
			
 
				-        BuildDecoderInput(h);
			
 
				-        decoder_out_ = model_->RunDecoder(decoder_input_);
			
 
				-        ncnn::Mat joiner_out = model_->RunJoiner(encoder_out_t, decoder_out_);
			
 
				-        auto joiner_out_ptr = joiner_out.row(0);
			
 
				-        log_softmax(joiner_out_ptr, joiner_out.w);
			
 
				-
			
 
				-        // update active_paths
			
 
				-        auto topk =
			
 
				-            topk_index(joiner_out_ptr, joiner_out.w, config_.num_active_paths);
			
 
				-        for (int i = 0; i != topk.size(); ++i) {
			
 
				-          Hypothesis new_hyp = h;
			
 
				-          int32_t new_token = topk[i];
			
 
				-          if (new_token != blank_id_) {
			
 
				-            new_hyp.ys.push_back(new_token);
			
 
				-            new_hyp.num_trailing_blanks = 0;
			
 
				-          } else {
			
 
				-            ++new_hyp.num_trailing_blanks;
			
 
				-          }
			
 
				-          new_hyp.log_prob += joiner_out_ptr[new_token];
			
 
				-          cur.Add(std::move(new_hyp));
			
 
				+    /* encoder_out.w == encoder_out_dim, encoder_out.h == num_frames. */
			
 
				+    for (int32_t t = 0; t != encoder_out.h; ++t) {
			
 
				+      std::vector<Hypothesis> prev =
			
 
				+          cur.GetTopK(config_.num_active_paths, true);
			
 
				+
			
 
				+      cur.Clear();
			
 
				+
			
 
				+      ncnn::Mat decoder_input = BuildDecoderInput(prev);
			
 
				+      ncnn::Mat decoder_out = RunDecoder2D(model_, decoder_input);
			
 
				+      // decoder_out.w == decoder_dim
			
 
				+      // decoder_out.h == num_active_paths
			
 
				+
			
 
				+      ncnn::Mat encoder_out_t(encoder_out.w, encoder_out.row(t));
			
 
				+      encoder_out_t = RepeatEncoderOut(encoder_out_t, decoder_out.h);
			
 
				+
			
 
				+      ncnn::Mat joiner_out = model_->RunJoiner(encoder_out_t, decoder_out);
			
 
				+      // joiner_out.w == vocab_size
			
 
				+      // joiner_out.h == num_active_paths
			
 
				+      LogSoftmax(&joiner_out);
			
 
				+      auto topk =
			
 
				+          TopkIndex(static_cast<float *>(joiner_out),
			
 
				+                    joiner_out.w * joiner_out.h, config_.num_active_paths);
			
 
				+
			
 
				+      for (auto i : topk) {
			
 
				+        int32_t hyp_index = i / joiner_out.w;
			
 
				+        int32_t new_token = i % joiner_out.w;
			
 
				+
			
 
				+        const float *p = joiner_out.row(hyp_index);
			
 
				+
			
 
				+        Hypothesis new_hyp = prev[hyp_index];
			
 
				+
			
 
				+        if (new_token != blank_id_) {
			
 
				+          new_hyp.ys.push_back(new_token);
			
 
				+          new_hyp.num_trailing_blanks = 0;
			
 
				+        } else {
			
 
				+          ++new_hyp.num_trailing_blanks;
			
 
				         }
			
 
				+        new_hyp.log_prob += p[new_token];
			
 
				+        cur.Add(std::move(new_hyp));
			
 
				       }
			
 
				-    }
			
 
				+    }  // for (int32_t t = 0; t != encoder_out.h; ++t) {
			
 
				 
			
 
				     num_processed_ += offset_;
			
 
				     result_.hyps = std::move(cur);
			
--- a/sherpa-ncnn/csrc/modified-beam-search-decoder.h
+++ b/sherpa-ncnn/csrc/modified-beam-search-decoder.h
@@ -42,7 +42,6 @@ class ModifiedBeamSearchDecoder : public Decoder {
 
				         context_size_(model_->ContextSize()),
			
 
				         segment_(model->Segment()),
			
 
				         offset_(model_->Offset()),
			
 
				-        decoder_input_(context_size_),
			
 
				         num_processed_(0),
			
 
				         endpoint_start_frame_(0),
			
 
				         endpoint_(endpoint) {
			
@@ -65,7 +64,7 @@ class ModifiedBeamSearchDecoder : public Decoder {
 
				   void InputFinished() override;
			
 
				 
			
 
				  private:
			
 
				-  void BuildDecoderInput(Hypothesis hyp);
			
 
				+  ncnn::Mat BuildDecoderInput(const std::vector<Hypothesis> &hyps) const;
			
 
				 
			
 
				   const DecoderConfig config_;
			
 
				   Model *model_;
			
@@ -75,10 +74,7 @@ class ModifiedBeamSearchDecoder : public Decoder {
 
				   const int32_t context_size_;
			
 
				   const int32_t segment_;
			
 
				   const int32_t offset_;
			
 
				-  ncnn::Mat encoder_out_;
			
 
				   std::vector<ncnn::Mat> encoder_state_;
			
 
				-  ncnn::Mat decoder_input_;
			
 
				-  ncnn::Mat decoder_out_;
			
 
				   int32_t num_processed_;
			
 
				   int32_t endpoint_start_frame_;
			
 
				   const Endpoint *endpoint_;