Browse Source

Optimize modified_beam_search (#98)

Fangjun Kuang 2 năm trước cách đây
mục cha
commit
4a5b459941

+ 1 - 1
CMakeLists.txt

@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-ncnn)
 
-set(SHERPA_NCNN_VERSION "1.4.1")
+set(SHERPA_NCNN_VERSION "1.4.2")
 
 # Disable warning about
 #

+ 22 - 0
sherpa-ncnn/csrc/hypothesis.cc

@@ -55,4 +55,26 @@ Hypothesis Hypotheses::GetMostProbable(bool length_norm) const {
   }
 }
 
+// Return the k best hypotheses contained in this object, best first.
+//
+// Hypotheses are ranked by log_prob, or by log_prob / len(ys) when
+// length_norm is true.
+//
+// @param k Number of hypotheses requested. It is first raised to at least 1
+//          and then capped at Size(), so an empty object yields an empty
+//          vector.
+// @param length_norm If true, log_prob is divided by len(ys) before
+//                    comparison.
+// @return Return the selected hypotheses sorted from best to worst.
+std::vector<Hypothesis> Hypotheses::GetTopK(int32_t k, bool length_norm) const {
+  k = std::max(k, 1);
+  k = std::min(k, Size());
+
+  std::vector<Hypothesis> all_hyps = Vec();
+
+  if (length_norm) {
+    std::partial_sort(all_hyps.begin(), all_hyps.begin() + k, all_hyps.end(),
+                      [](const auto &a, const auto &b) {
+                        return a.log_prob / a.ys.size() >
+                               b.log_prob / b.ys.size();
+                      });
+  } else {
+    std::partial_sort(
+        all_hyps.begin(), all_hyps.begin() + k, all_hyps.end(),
+        [](const auto &a, const auto &b) { return a.log_prob > b.log_prob; });
+  }
+
+  // Only the first k entries are sorted. Drop the rest and hand back the
+  // same vector so that the selected hypotheses are not copied a second time.
+  all_hyps.erase(all_hyps.begin() + k, all_hyps.end());
+  return all_hyps;
+}
+
 }  // namespace sherpa_ncnn

+ 17 - 8
sherpa-ncnn/csrc/hypothesis.h

@@ -83,13 +83,14 @@ class Hypotheses {
   void Add(Hypothesis hyp);
 
   // Get the hyp that has the largest log_prob.
-  // If length_norm is true, hyp's log_prob are divided by
+  // If length_norm is true, hyp's log_prob is divided by
   // len(hyp.ys) before comparison.
   Hypothesis GetMostProbable(bool length_norm) const;
 
-  // Remove the given hyp from this object.
-  // It is *NOT* an error if hyp does not exist in this object.
-  void Remove(const Hypothesis &hyp) { hyps_dict_.erase(hyp.Key()); }
+  // Get the k hyps that have the largest log_prob.
+  // If length_norm is true, hyp's log_prob is divided by
+  // len(hyp.ys) before comparison.
+  std::vector<Hypothesis> GetTopK(int32_t k, bool length_norm) const;
 
   int32_t Size() const { return hyps_dict_.size(); }
 
@@ -101,13 +102,21 @@ class Hypotheses {
     return os.str();
   }
 
-  auto begin() { return hyps_dict_.begin(); }
-  auto end() { return hyps_dict_.end(); }
-
   const auto begin() const { return hyps_dict_.begin(); }
   const auto end() const { return hyps_dict_.end(); }
 
-  void clear() { hyps_dict_.clear(); }
+  void Clear() { hyps_dict_.clear(); }
+
+ private:
+  // Copy every hypothesis stored in hyps_dict_ into a vector.
+  // The entries appear in the iteration order of hyps_dict_.
+  std::vector<Hypothesis> Vec() const {
+    std::vector<Hypothesis> hyps;
+    hyps.reserve(hyps_dict_.size());
+    for (auto it = hyps_dict_.begin(); it != hyps_dict_.end(); ++it) {
+      hyps.push_back(it->second);
+    }
+    return hyps;
+  }
 
  private:
   using Map = std ::unordered_map<std::string, Hypothesis>;

+ 2 - 2
sherpa-ncnn/csrc/math.h

@@ -85,7 +85,7 @@ struct LogAdd<float> {
 };
 
 template <class T>
-void log_softmax(T *input, int32_t input_len) {
+void LogSoftmax(T *input, int32_t input_len) {
   assert(input);
 
   T m = *std::max_element(input, input + input_len);
@@ -102,7 +102,7 @@ void log_softmax(T *input, int32_t input_len) {
 }
 
 template <class T>
-std::vector<int32_t> topk_index(const T *vec, int32_t size, int32_t topk) {
+std::vector<int32_t> TopkIndex(const T *vec, int32_t size, int32_t topk) {
   std::vector<int32_t> vec_index(size);
   std::iota(vec_index.begin(), vec_index.end(), 0);
 

+ 123 - 38
sherpa-ncnn/csrc/modified-beam-search-decoder.cc

@@ -18,6 +18,7 @@
  */
 #include "sherpa-ncnn/csrc/modified-beam-search-decoder.h"
 
+#include <algorithm>
 #include <string>
 #include <utility>
 
@@ -25,6 +26,74 @@
 
 namespace sherpa_ncnn {
 
+// Stack n copies of a 1-D tensor into the rows of a 2-D tensor.
+//
+// @param in 1-D tensor of shape (encoder_dim,)
+// @param n Number of rows in the output, i.e., how many times `in` is copied
+// @return Return a 2-d tensor of shape (n, encoder_dim)
+//
+// TODO(fangjun): Remove this function
+// once
+// https://github.com/nihui/ncnn/tree/pnnx-ncnn-binary-broadcast
+// gets merged
+static ncnn::Mat RepeatEncoderOut(ncnn::Mat in, int32_t n) {
+  const int32_t dim = in.w;
+  const float *src = in;
+
+  ncnn::Mat repeated(dim, n, sizeof(float));
+
+  // Row r of the output is an exact copy of the input.
+  for (int32_t r = 0; r != n; ++r) {
+    std::copy(src, src + dim, repeated.row(r));
+  }
+
+  return repeated;
+}
+
+// Apply log_softmax to every row of a 2-D tensor, overwriting its contents.
+//
+// Each row is handled independently by the pointer-based LogSoftmax overload.
+//
+// @param in_out A 2-D tensor; it is modified in-place.
+static void LogSoftmax(ncnn::Mat *in_out) {
+  const int32_t num_rows = in_out->h;
+  const int32_t num_cols = in_out->w;
+
+  for (int32_t r = 0; r != num_rows; ++r) {
+    LogSoftmax(in_out->row(r), num_cols);
+  }
+}
+
+// Run the decoder model on every row of a 2-D input.
+//
+// The decoder model contains an embedding layer, which only supports
+// 1-D output, so each row is passed to the model on its own and the
+// per-row outputs are stacked into a 2-D tensor.
+//
+// @param model The NN model.
+// @param decoder_input A 2-D tensor of shape (num_active_paths, context_size)
+// @return Return a 2-D tensor of shape (num_active_paths, decoder_dim).
+//         If decoder_input has no rows, a default-constructed tensor is
+//         returned.
+//
+// TODO(fangjun): Change Embed in ncnn to output 2-d tensors
+static ncnn::Mat RunDecoder2D(Model *model, ncnn::Mat decoder_input) {
+  const int32_t num_rows = decoder_input.h;
+  const int32_t num_cols = decoder_input.w;
+
+  ncnn::Mat decoder_out;
+
+  for (int32_t r = 0; r != num_rows; ++r) {
+    // Present row r of the input to the model as a 1-D tensor.
+    ncnn::Mat row_in(num_cols, decoder_input.row(r));
+    ncnn::Mat row_out = model->RunDecoder(row_in);
+
+    // The output width is only known once the model has run, so the
+    // result tensor is allocated on the first iteration.
+    if (r == 0) {
+      decoder_out = ncnn::Mat(row_out.w, num_rows);
+    }
+
+    const float *src = row_out;
+    std::copy(src, src + row_out.w, decoder_out.row(r));
+  }
+
+  return decoder_out;
+}
+
 void ModifiedBeamSearchDecoder::AcceptWaveform(const float sample_rate,
                                                const float *input_buffer,
                                                int32_t frames_per_buffer) {
@@ -32,11 +101,20 @@ void ModifiedBeamSearchDecoder::AcceptWaveform(const float sample_rate,
                                     frames_per_buffer);
 }
 
-void ModifiedBeamSearchDecoder::BuildDecoderInput(Hypothesis hyp) {
-  for (int32_t i = 0; i != context_size_; ++i) {
-    static_cast<int32_t *>(decoder_input_)[i] =
-        *(hyp.ys.end() - context_size_ + i);
+// Pack the decoder context of every hypothesis into one 2-D tensor.
+//
+// @param hyps The hypotheses to process. Each hyp.ys must hold at least
+//             context_size_ entries, since the last context_size_ of them
+//             are read.
+// @return Return a 2-D tensor with hyps.size() rows and context_size_
+//         columns. Row i contains the last context_size_ entries of
+//         hyps[i].ys, written as int32_t.
+ncnn::Mat ModifiedBeamSearchDecoder::BuildDecoderInput(
+    const std::vector<Hypothesis> &hyps) const {
+  const int32_t num_rows = static_cast<int32_t>(hyps.size());
+
+  ncnn::Mat decoder_input(context_size_, num_rows);
+  int32_t *dst = static_cast<int32_t *>(decoder_input);
+
+  for (int32_t i = 0; i != num_rows; ++i) {
+    const auto &tokens = hyps[i].ys;
+    std::copy(tokens.end() - context_size_, tokens.end(), dst);
+    dst += context_size_;
   }
+
+  return decoder_input;
 }
 
 void ModifiedBeamSearchDecoder::ResetResult() {
@@ -50,45 +128,52 @@ void ModifiedBeamSearchDecoder::ResetResult() {
 void ModifiedBeamSearchDecoder::Decode() {
   while (feature_extractor_.NumFramesReady() - num_processed_ >= segment_) {
     ncnn::Mat features = feature_extractor_.GetFrames(num_processed_, segment_);
-    std::tie(encoder_out_, encoder_state_) =
+    ncnn::Mat encoder_out;
+    std::tie(encoder_out, encoder_state_) =
         model_->RunEncoder(features, encoder_state_);
 
     Hypotheses cur = std::move(result_.hyps);
-    /* encoder_out_.w == encoder_out_dim, encoder_out_.h == num_frames. */
-    for (int32_t t = 0; t != encoder_out_.h; ++t) {
-      std::vector<Hypothesis> prev;
-      for (int32_t i = 0; i != config_.num_active_paths && cur.Size(); ++i) {
-        auto cur_best_hyp = cur.GetMostProbable(true);
-        cur.Remove(cur_best_hyp);
-        prev.push_back(std::move(cur_best_hyp));
-      }
-      cur.clear();
-
-      for (const auto &h : prev) {
-        ncnn::Mat encoder_out_t(encoder_out_.w, encoder_out_.row(t));
-        BuildDecoderInput(h);
-        decoder_out_ = model_->RunDecoder(decoder_input_);
-        ncnn::Mat joiner_out = model_->RunJoiner(encoder_out_t, decoder_out_);
-        auto joiner_out_ptr = joiner_out.row(0);
-        log_softmax(joiner_out_ptr, joiner_out.w);
-
-        // update active_paths
-        auto topk =
-            topk_index(joiner_out_ptr, joiner_out.w, config_.num_active_paths);
-        for (int i = 0; i != topk.size(); ++i) {
-          Hypothesis new_hyp = h;
-          int32_t new_token = topk[i];
-          if (new_token != blank_id_) {
-            new_hyp.ys.push_back(new_token);
-            new_hyp.num_trailing_blanks = 0;
-          } else {
-            ++new_hyp.num_trailing_blanks;
-          }
-          new_hyp.log_prob += joiner_out_ptr[new_token];
-          cur.Add(std::move(new_hyp));
+    /* encoder_out.w == encoder_out_dim, encoder_out.h == num_frames. */
+    for (int32_t t = 0; t != encoder_out.h; ++t) {
+      std::vector<Hypothesis> prev =
+          cur.GetTopK(config_.num_active_paths, true);
+
+      cur.Clear();
+
+      ncnn::Mat decoder_input = BuildDecoderInput(prev);
+      ncnn::Mat decoder_out = RunDecoder2D(model_, decoder_input);
+      // decoder_out.w == decoder_dim
+      // decoder_out.h == num_active_paths
+
+      ncnn::Mat encoder_out_t(encoder_out.w, encoder_out.row(t));
+      encoder_out_t = RepeatEncoderOut(encoder_out_t, decoder_out.h);
+
+      ncnn::Mat joiner_out = model_->RunJoiner(encoder_out_t, decoder_out);
+      // joiner_out.w == vocab_size
+      // joiner_out.h == num_active_paths
+      LogSoftmax(&joiner_out);
+      auto topk =
+          TopkIndex(static_cast<float *>(joiner_out),
+                    joiner_out.w * joiner_out.h, config_.num_active_paths);
+
+      for (auto i : topk) {
+        int32_t hyp_index = i / joiner_out.w;
+        int32_t new_token = i % joiner_out.w;
+
+        const float *p = joiner_out.row(hyp_index);
+
+        Hypothesis new_hyp = prev[hyp_index];
+
+        if (new_token != blank_id_) {
+          new_hyp.ys.push_back(new_token);
+          new_hyp.num_trailing_blanks = 0;
+        } else {
+          ++new_hyp.num_trailing_blanks;
         }
+        new_hyp.log_prob += p[new_token];
+        cur.Add(std::move(new_hyp));
       }
-    }
+    }  // for (int32_t t = 0; t != encoder_out.h; ++t) {
 
     num_processed_ += offset_;
     result_.hyps = std::move(cur);

+ 1 - 5
sherpa-ncnn/csrc/modified-beam-search-decoder.h

@@ -42,7 +42,6 @@ class ModifiedBeamSearchDecoder : public Decoder {
         context_size_(model_->ContextSize()),
         segment_(model->Segment()),
         offset_(model_->Offset()),
-        decoder_input_(context_size_),
         num_processed_(0),
         endpoint_start_frame_(0),
         endpoint_(endpoint) {
@@ -65,7 +64,7 @@ class ModifiedBeamSearchDecoder : public Decoder {
   void InputFinished() override;
 
  private:
-  void BuildDecoderInput(Hypothesis hyp);
+  ncnn::Mat BuildDecoderInput(const std::vector<Hypothesis> &hyps) const;
 
   const DecoderConfig config_;
   Model *model_;
@@ -75,10 +74,7 @@ class ModifiedBeamSearchDecoder : public Decoder {
   const int32_t context_size_;
   const int32_t segment_;
   const int32_t offset_;
-  ncnn::Mat encoder_out_;
   std::vector<ncnn::Mat> encoder_state_;
-  ncnn::Mat decoder_input_;
-  ncnn::Mat decoder_out_;
   int32_t num_processed_;
   int32_t endpoint_start_frame_;
   const Endpoint *endpoint_;