瀏覽代碼

Use ALSA to read the microphone for Linux (#73)

* add alsa

* Use ALSA to read microphone for Linux
Fangjun Kuang 2 年之前
父節點
當前提交
5f5385c018

+ 7 - 0
CMakeLists.txt

@@ -40,6 +40,7 @@ message(STATUS "SHERPA_NCNN_ENABLE_JNI ${SHERPA_NCNN_ENABLE_JNI}")
 message(STATUS "SHERPA_NCNN_ENABLE_BINARY ${SHERPA_NCNN_ENABLE_BINARY}")
 message(STATUS "SHERPA_NCNN_ENABLE_TEST ${SHERPA_NCNN_ENABLE_TEST}")
 
+
 if(NOT CMAKE_BUILD_TYPE)
   message(STATUS "No CMAKE_BUILD_TYPE given, default to Release")
   set(CMAKE_BUILD_TYPE Release)
@@ -49,6 +50,12 @@ message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
 set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
 set(CMAKE_CXX_EXTENSIONS OFF)
 
+# Probe for the ALSA development header. When it is found, the macro
+# SHERPA_NCNN_ENABLE_ALSA=1 is added to the compile line of the targets in
+# this directory and below, which enables the ALSA-guarded sources.
+include(CheckIncludeFileCXX)
+check_include_file_cxx("alsa/asoundlib.h" SHERPA_NCNN_HAS_ALSA)
+if(SHERPA_NCNN_HAS_ALSA)
+  add_definitions("-DSHERPA_NCNN_ENABLE_ALSA=1")
+endif()
+
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
 
 include(kaldi-native-fbank)

+ 18 - 1
build-aarch64-linux-gnu.sh

@@ -13,11 +13,28 @@ set -x
 dir=build-aarch64-linux-gnu
 mkdir -p $dir
 cd $dir
+
+# Cross-compile alsa-lib once; the presence of the shared library is used as
+# the "already built" marker so that re-running this script is cheap.
+if [ ! -f alsa-lib/src/.libs/libasound.so ]; then
+  echo "Start to cross-compile alsa-lib"
+  if [ ! -d alsa-lib ]; then
+    git clone --depth 1 https://github.com/alsa-project/alsa-lib
+  fi
+  pushd alsa-lib
+  CC=aarch64-linux-gnu-gcc ./gitcompile --host=aarch64-linux-gnu
+  popd
+  echo "Finish cross-compiling alsa-lib"
+fi
+
+# Let the C++ compiler find <alsa/asoundlib.h>. The ":old-value" suffix is only
+# appended when CPLUS_INCLUDE_PATH is already set and non-empty: a trailing ':'
+# is an empty path element, which GCC interprets as the current directory.
+export CPLUS_INCLUDE_PATH="$PWD/alsa-lib/include${CPLUS_INCLUDE_PATH:+:$CPLUS_INCLUDE_PATH}"
+# Read by sherpa-ncnn/csrc/CMakeLists.txt to locate the cross-compiled libasound.
+export SHERPA_NCNN_ALSA_LIB_DIR="$PWD/alsa-lib/src/.libs"
+
 cmake \
   -DCMAKE_INSTALL_PREFIX=./install \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake \
   ..
 
-make VERBOSE=1 -j4
+make VERBOSE=1 -j10
 make install/strip
+
+# Ship the cross-compiled ALSA runtime next to the installed binaries. This has
+# to happen after "make install": before it, ./install/lib does not exist yet.
+mkdir -p ./install/lib
+cp -v "$SHERPA_NCNN_ALSA_LIB_DIR"/libasound.so* ./install/lib/
+

+ 13 - 0
sherpa-ncnn/csrc/CMakeLists.txt

@@ -25,6 +25,18 @@ if(NOT SHERPA_NCNN_ENABLE_PYTHON)
     target_link_libraries(sherpa-ncnn PRIVATE sherpa-ncnn-core)
     install(TARGETS sherpa-ncnn DESTINATION bin)
 
+    # Microphone demo that reads audio through ALSA. It is only built when the
+    # ALSA header was detected at configure time.
+    if(SHERPA_NCNN_HAS_ALSA)
+      add_executable(sherpa-ncnn-alsa
+        sherpa-ncnn-alsa.cc
+        alsa.cc
+      )
+
+      # A cross build points SHERPA_NCNN_ALSA_LIB_DIR (environment variable,
+      # read at configure time) at its own libasound; otherwise the linker's
+      # default search path is used.
+      if(DEFINED ENV{SHERPA_NCNN_ALSA_LIB_DIR})
+        target_link_libraries(sherpa-ncnn-alsa PRIVATE
+          sherpa-ncnn-core
+          -L$ENV{SHERPA_NCNN_ALSA_LIB_DIR}
+          -lasound
+        )
+      else()
+        target_link_libraries(sherpa-ncnn-alsa PRIVATE
+          sherpa-ncnn-core
+          asound
+        )
+      endif()
+
+      install(TARGETS sherpa-ncnn-alsa DESTINATION bin)
+    endif()
+
     if(SHERPA_NCNN_ENABLE_PORTAUDIO)
       add_executable(sherpa-ncnn-microphone
         sherpa-ncnn-microphone.cc
@@ -63,3 +75,4 @@ if(SHERPA_NCNN_ENABLE_TEST)
   add_executable(test-resample test-resample.cc)
   target_link_libraries(test-resample sherpa-ncnn-core)
 endif()
+

+ 162 - 0
sherpa-ncnn/csrc/alsa.cc

@@ -0,0 +1,162 @@
+/**
+ * Copyright (c)  2023  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef SHERPA_NCNN_ENABLE_ALSA
+
+#include "sherpa-ncnn/csrc/alsa.h"
+
+#include "alsa/asoundlib.h"
+
+namespace sherpa_ncnn {
+
+// Converts 16-bit PCM samples to floats by dividing each sample by 32768.
+//
+// @param in   Input samples.
+// @param out  Resized to in.size() and overwritten with the scaled samples.
+void ToFloat(const std::vector<int16_t> &in, std::vector<float> *out) {
+  out->resize(in.size());
+
+  const int32_t num_samples = in.size();
+  const int16_t *src = in.data();
+  float *dst = out->data();
+
+  for (int32_t k = 0; k < num_samples; ++k) {
+    dst[k] = src[k] / 32768.;
+  }
+}
+
+// Opens the capture device `device_name` and configures it for interleaved,
+// signed 16-bit little-endian, mono capture at (or near) expected_sample_rate_.
+//
+// If the device does not run at exactly expected_sample_rate_, a
+// LinearResample is created to convert from the actual rate to the expected
+// one.
+//
+// Every ALSA failure is reported on stderr and terminates the process with
+// exit(-1).
+//
+// @param device_name  ALSA device name, e.g., hw:3,0
+Alsa::Alsa(const char *device_name) {
+  const char *kDeviceHelp = R"(
+Please use the command:
+
+  arecord -l
+
+to list all available devices. For instance, if the output is:
+
+**** List of CAPTURE Hardware Devices ****
+card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+
+and if you want to select card 3 and the device 0 on that card, please use:
+
+  hw:3,0
+
+  )";
+
+  int32_t err =
+      snd_pcm_open(&capture_handle_, device_name, SND_PCM_STREAM_CAPTURE, 0);
+  if (err) {
+    fprintf(stderr, "Unable to open: %s. %s\n", device_name, snd_strerror(err));
+    fprintf(stderr, "%s\n", kDeviceHelp);
+    exit(-1);
+  }
+
+  // hw_params is allocated with alloca, so there is nothing to free.
+  snd_pcm_hw_params_t *hw_params;
+  snd_pcm_hw_params_alloca(&hw_params);
+
+  err = snd_pcm_hw_params_any(capture_handle_, hw_params);
+  if (err) {
+    fprintf(stderr, "Failed to initialize hw_params: %s\n", snd_strerror(err));
+    exit(-1);
+  }
+
+  err = snd_pcm_hw_params_set_access(capture_handle_, hw_params,
+                                     SND_PCM_ACCESS_RW_INTERLEAVED);
+  if (err) {
+    fprintf(stderr, "Failed to set access type: %s\n", snd_strerror(err));
+    exit(-1);
+  }
+
+  err = snd_pcm_hw_params_set_format(capture_handle_, hw_params,
+                                     SND_PCM_FORMAT_S16_LE);
+  if (err) {
+    fprintf(stderr, "Failed to set format: %s\n", snd_strerror(err));
+    exit(-1);
+  }
+
+  // mono
+  err = snd_pcm_hw_params_set_channels(capture_handle_, hw_params, 1);
+  if (err) {
+    fprintf(stderr, "Failed to set number of channels to 1. %s\n",
+            snd_strerror(err));
+    exit(-1);
+  }
+
+  // Ask for the expected rate; ALSA writes back the rate it actually chose.
+  uint32_t actual_sample_rate = expected_sample_rate_;
+
+  int32_t dir = 0;
+  err = snd_pcm_hw_params_set_rate_near(capture_handle_, hw_params,
+                                        &actual_sample_rate, &dir);
+  if (err) {
+    fprintf(stderr, "Failed to set sample rate to %d: %s\n",
+            expected_sample_rate_, snd_strerror(err));
+    exit(-1);
+  }
+  actual_sample_rate_ = actual_sample_rate;
+
+  if (actual_sample_rate_ != expected_sample_rate_) {
+    // The device runs at a different rate: resample whatever Read() captures.
+    fprintf(stderr, "Failed to set sample rate to %d\n", expected_sample_rate_);
+    fprintf(stderr, "Current sample rate is %d\n", actual_sample_rate_);
+    fprintf(stderr,
+            "Creating a resampler:\n"
+            "   in_sample_rate: %d\n"
+            "   output_sample_rate: %d\n",
+            actual_sample_rate_, expected_sample_rate_);
+
+    // Cut off slightly below half of the lower of the two rates.
+    float min_freq = std::min(actual_sample_rate_, expected_sample_rate_);
+    float lowpass_cutoff = 0.99 * 0.5 * min_freq;
+
+    int32_t lowpass_filter_width = 6;
+    resampler_ = std::make_unique<LinearResample>(
+        actual_sample_rate_, expected_sample_rate_, lowpass_cutoff,
+        lowpass_filter_width);
+  } else {
+    fprintf(stderr, "Current sample rate: %d\n", actual_sample_rate_);
+  }
+
+  // Install the parameters collected above on the device.
+  err = snd_pcm_hw_params(capture_handle_, hw_params);
+  if (err) {
+    fprintf(stderr, "Failed to set hw params: %s\n", snd_strerror(err));
+    exit(-1);
+  }
+
+  err = snd_pcm_prepare(capture_handle_);
+  if (err) {
+    fprintf(stderr, "Failed to prepare for recording: %s\n", snd_strerror(err));
+    exit(-1);
+  }
+
+  fprintf(stderr, "Recording started!\n");
+}
+
+// Closes the PCM capture handle that the constructor opened.
+Alsa::~Alsa() {
+  snd_pcm_close(capture_handle_);
+}
+
+// Blocking read of up to `num_samples` samples from the capture device.
+//
+// The samples are converted to float and, if a resampler was created in the
+// constructor, resampled before being returned.
+//
+// snd_pcm_readi() returns a negative error code on failure (for example on an
+// overrun). Such a value must never reach std::vector::resize(), so it is
+// handled here: if snd_pcm_recover() can repair the stream, an empty vector is
+// returned for this call; otherwise the error is printed and the process
+// exits, like the constructor does on failure.
+//
+// @param num_samples  Number of samples to read.
+// @return A reference to an internal buffer; valid until the next call.
+const std::vector<float> &Alsa::Read(int32_t num_samples) {
+  samples_.resize(num_samples);
+
+  int32_t count =
+      snd_pcm_readi(capture_handle_, samples_.data(), samples_.size());
+
+  if (count < 0) {
+    // The last argument (1) asks snd_pcm_recover() not to print a message.
+    int32_t err = snd_pcm_recover(capture_handle_, count, 1);
+    if (err < 0) {
+      fprintf(stderr, "Failed to read from the microphone: %s\n",
+              snd_strerror(err));
+      exit(-1);
+    }
+
+    // Recovered, but nothing was captured by this call.
+    samples_.clear();
+    samples1_.clear();
+    return samples1_;
+  }
+
+  // A short read is possible; keep only what was actually captured.
+  samples_.resize(count);
+
+  ToFloat(samples_, &samples1_);
+
+  if (!resampler_) {
+    return samples1_;
+  }
+
+  resampler_->Resample(samples1_.data(), samples1_.size(), false, &samples2_);
+  return samples2_;
+}
+
+}  // namespace sherpa_ncnn
+
+#endif

+ 58 - 0
sherpa-ncnn/csrc/alsa.h

@@ -0,0 +1,58 @@
+/**
+ * Copyright (c)  2023  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SHERPA_NCNN_CSRC_ALSA_H_
+#define SHERPA_NCNN_CSRC_ALSA_H_
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "alsa/asoundlib.h"
+#include "sherpa-ncnn/csrc/resample.h"
+
+namespace sherpa_ncnn {
+
+// Reads audio samples from a microphone through ALSA.
+//
+// The device is opened in the constructor and closed in the destructor.
+class Alsa {
+ public:
+  // @param device_name  Name of the ALSA capture device, e.g., hw:3,0
+  explicit Alsa(const char *device_name);
+  ~Alsa();
+
+  // This is a blocking read.
+  //
+  // @param num_samples  Number of samples to read.
+  //
+  // The returned value is valid until the next call to Read().
+  const std::vector<float> &Read(int32_t num_samples);
+
+  // Sample rate that was requested from the device.
+  int32_t GetExpectedSampleRate() const { return expected_sample_rate_; }
+
+  // Sample rate the device actually runs at; set by the constructor.
+  int32_t GetActualSampleRate() const { return actual_sample_rate_; }
+
+ private:
+  snd_pcm_t *capture_handle_;
+  int32_t expected_sample_rate_ = 16000;
+  int32_t actual_sample_rate_;
+
+  // Only created when actual_sample_rate_ != expected_sample_rate_.
+  std::unique_ptr<LinearResample> resampler_;
+  std::vector<int16_t> samples_;  // directly from the microphone
+  std::vector<float> samples1_;   // normalized version of samples_
+  std::vector<float> samples2_;   // possibly resampled from samples1_
+};
+
+}  // namespace sherpa_ncnn
+
+#endif  // SHERPA_NCNN_CSRC_ALSA_H_

+ 169 - 0
sherpa-ncnn/csrc/sherpa-ncnn-alsa.cc

@@ -0,0 +1,169 @@
+/**
+ * Copyright (c)  2023  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cstdint>
+
+#include "sherpa-ncnn/csrc/alsa.h"
+#include "sherpa-ncnn/csrc/recognizer.h"
+
+// Set by the SIGINT handler and polled by the main loop. volatile sig_atomic_t
+// is the type the C and C++ standards allow a signal handler to write to.
+volatile sig_atomic_t stop = 0;
+
+// SIGINT handler: asks the main loop to finish.
+static void Handler(int /*sig*/) {
+  stop = 1;
+  // NOTE(review): fprintf() is not async-signal-safe; it is kept so that the
+  // user still gets the same feedback on Ctrl + C.
+  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
+}
+
+// Streaming speech recognition from a microphone that is read through ALSA.
+//
+// Command line (see the usage text below):
+//   argv[1..7]  model files
+//   argv[8]     ALSA device name
+//   argv[9]     optional, number of threads
+//   argv[10]    optional, decoding method
+//
+// Runs until SIGINT is received.
+int main(int32_t argc, char *argv[]) {
+  if (argc < 9 || argc > 11) {
+    const char *usage = R"usage(
+Usage:
+  ./bin/sherpa-ncnn-alsa \
+    /path/to/tokens.txt \
+    /path/to/encoder.ncnn.param \
+    /path/to/encoder.ncnn.bin \
+    /path/to/decoder.ncnn.param \
+    /path/to/decoder.ncnn.bin \
+    /path/to/joiner.ncnn.param \
+    /path/to/joiner.ncnn.bin \
+    device_name \
+    [num_threads] [decode_method, can be greedy_search/modified_beam_search]
+
+Please refer to
+https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
+for a list of pre-trained models to download.
+
+The device name specifies which microphone to use in case there are several
+on your system. You can use
+
+  arecord -l
+
+to find all available microphones on your computer. For instance, if it outputs
+
+**** List of CAPTURE Hardware Devices ****
+card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+
+and if you want to select card 3 and the device 0 on that card, please use:
+
+  hw:3,0
+
+as the device_name.
+)usage";
+
+    fprintf(stderr, "%s\n", usage);
+    fprintf(stderr, "argc, %d\n", argc);
+
+    return 0;
+  }
+
+  signal(SIGINT, Handler);
+
+  sherpa_ncnn::ModelConfig model_conf;
+  model_conf.tokens = argv[1];
+  model_conf.encoder_param = argv[2];
+  model_conf.encoder_bin = argv[3];
+  model_conf.decoder_param = argv[4];
+  model_conf.decoder_bin = argv[5];
+  model_conf.joiner_param = argv[6];
+  model_conf.joiner_bin = argv[7];
+
+  const char *device_name = argv[8];
+
+  // A missing or non-positive num_threads falls back to 4.
+  int num_threads = 4;
+  if (argc >= 10 && atoi(argv[9]) > 0) {
+    num_threads = atoi(argv[9]);
+  }
+
+  model_conf.encoder_opt.num_threads = num_threads;
+  model_conf.decoder_opt.num_threads = num_threads;
+  model_conf.joiner_opt.num_threads = num_threads;
+
+  fprintf(stderr, "%s\n", model_conf.ToString().c_str());
+
+  sherpa_ncnn::DecoderConfig decoder_conf;
+  // The decoding method is the last optional argument, i.e., argv[10];
+  // argv[9] holds num_threads.
+  if (argc == 11) {
+    std::string method = argv[10];
+    if (method == "greedy_search" || method == "modified_beam_search") {
+      decoder_conf.method = method;
+    } else {
+      fprintf(stderr,
+              "Unknown decode_method: %s. Keeping the default method.\n",
+              method.c_str());
+    }
+  }
+
+  decoder_conf.enable_endpoint = true;
+
+  sherpa_ncnn::EndpointConfig endpoint_config;
+  endpoint_config.rule1.min_trailing_silence = 2.4;
+  endpoint_config.rule2.min_trailing_silence = 1.2;
+  endpoint_config.rule3.min_utterance_length = 300;
+
+  decoder_conf.endpoint_config = endpoint_config;
+
+  fprintf(stderr, "%s\n", decoder_conf.ToString().c_str());
+
+  int32_t expected_sampling_rate = 16000;
+  knf::FbankOptions fbank_opts;
+  fbank_opts.frame_opts.dither = 0;
+  fbank_opts.frame_opts.snip_edges = false;
+  fbank_opts.frame_opts.samp_freq = expected_sampling_rate;
+  fbank_opts.mel_opts.num_bins = 80;
+
+  sherpa_ncnn::Recognizer recognizer(decoder_conf, model_conf, fbank_opts);
+  sherpa_ncnn::Alsa alsa(device_name);
+
+  if (alsa.GetExpectedSampleRate() != expected_sampling_rate) {
+    fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
+            expected_sampling_rate);
+    exit(-1);
+  }
+
+  // Read 0.1 second of audio (at the device's own rate) per iteration.
+  int32_t chunk = 0.1 * alsa.GetActualSampleRate();
+
+  std::string last_text;
+  int32_t segment_index = 0;
+  while (!stop) {
+    // Read() returns a reference to an internal buffer that stays valid until
+    // the next call, so no copy is needed here.
+    const std::vector<float> &samples = alsa.Read(chunk);
+
+    recognizer.AcceptWaveform(expected_sampling_rate, samples.data(),
+                              samples.size());
+    recognizer.Decode();
+    bool is_endpoint = recognizer.IsEndpoint();
+    auto text = recognizer.GetResult().text;
+
+    // Print only when the recognized text has changed.
+    if (!text.empty() && last_text != text) {
+      last_text = text;
+
+      // If you want to display in lower case, please uncomment
+      // the following two lines
+      // std::transform(text.begin(), text.end(), text.begin(),
+      //                [](auto c) { return std::tolower(c); });
+
+      fprintf(stderr, "%d: %s\n", segment_index, text.c_str());
+    }
+
+    if (!text.empty() && is_endpoint) {
+      ++segment_index;
+    }
+  }
+
+  return 0;
+}

+ 12 - 10
sherpa-ncnn/csrc/sherpa-ncnn-microphone.cc

@@ -26,11 +26,12 @@
 
 bool stop = false;
 
-static int RecordCallback(const void *input_buffer, void * /*output_buffer*/,
-                          unsigned long frames_per_buffer,  // NOLINT
-                          const PaStreamCallbackTimeInfo * /*time_info*/,
-                          PaStreamCallbackFlags /*status_flags*/,
-                          void *user_data) {
+static int32_t RecordCallback(const void *input_buffer,
+                              void * /*output_buffer*/,
+                              unsigned long frames_per_buffer,  // NOLINT
+                              const PaStreamCallbackTimeInfo * /*time_info*/,
+                              PaStreamCallbackFlags /*status_flags*/,
+                              void *user_data) {
   auto recognizer = reinterpret_cast<sherpa_ncnn::Recognizer *>(user_data);
 
   recognizer->AcceptWaveform(
@@ -39,12 +40,12 @@ static int RecordCallback(const void *input_buffer, void * /*output_buffer*/,
   return stop ? paComplete : paContinue;
 }
 
-static void Handler(int sig) {
+static void Handler(int32_t sig) {
   stop = true;
-  fprintf(stderr, "\nexiting...\n");
+  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
 };
 
-int main(int32_t argc, char *argv[]) {
+int32_t main(int32_t argc, char *argv[]) {
   if (argc < 8 || argc > 10) {
     const char *usage = R"usage(
 Usage:
@@ -77,7 +78,7 @@ for a list of pre-trained models to download.
   model_conf.decoder_bin = argv[5];
   model_conf.joiner_param = argv[6];
   model_conf.joiner_bin = argv[7];
-  int num_threads = 4;
+  int32_t num_threads = 4;
   if (argc >= 9 && atoi(argv[8]) > 0) {
     num_threads = atoi(argv[8]);
   }
@@ -152,11 +153,12 @@ for a list of pre-trained models to download.
     exit(EXIT_FAILURE);
   }
 
-  int num_tokens = 0;
+  int32_t num_tokens = 0;
   while (!stop) {
     recognizer.Decode();
     auto result = recognizer.GetResult();
     if (result.text.size() != num_tokens) {
+      num_tokens = result.text.size();
       fprintf(stderr, "%s\n", result.text.c_str());
     }
 

+ 2 - 2
sherpa-ncnn/csrc/sherpa-ncnn.cc

@@ -24,7 +24,7 @@
 #include "sherpa-ncnn/csrc/recognizer.h"
 #include "sherpa-ncnn/csrc/wave-reader.h"
 
-int main(int argc, char *argv[]) {
+int32_t main(int32_t argc, char *argv[]) {
   if (argc < 9 || argc > 11) {
     const char *usage = R"usage(
 Usage:
@@ -54,7 +54,7 @@ for a list of pre-trained models to download.
   model_conf.decoder_bin = argv[5];
   model_conf.joiner_param = argv[6];
   model_conf.joiner_bin = argv[7];
-  int num_threads = 4;
+  int32_t num_threads = 4;
   if (argc >= 10 && atoi(argv[9]) > 0) {
     num_threads = atoi(argv[9]);
   }