Parcourir la source

Add Silero VAD version 4 (#353)

Fangjun Kuang il y a 11 mois
Parent
commit
f92325f041
37 fichiers modifiés avec 1976 ajouts et 116 suppressions
  1. 29 0
      .github/scripts/test-vad.sh
  2. 2 0
      .github/workflows/aarch64-linux-gnu.yaml
  3. 19 1
      .github/workflows/android.yaml
  4. 2 0
      .github/workflows/arm-linux-gnueabihf.yaml
  5. 1 1
      .github/workflows/build-wasm-simd.yaml
  6. 10 0
      .github/workflows/jni.yaml
  7. 27 1
      .github/workflows/linux.yaml
  8. 28 2
      .github/workflows/macos.yaml
  9. 8 0
      .github/workflows/npm.yaml
  10. 10 0
      .github/workflows/swift-api-test.yaml
  11. 4 7
      .github/workflows/test-dot-net.yaml
  12. 2 0
      .github/workflows/test-pip-install.yaml
  13. 9 0
      .github/workflows/wasm-simd-hf-space-en.yaml
  14. 9 0
      .github/workflows/wasm-simd-hf-space-zh-en.yaml
  15. 14 0
      .github/workflows/windows-x64.yaml
  16. 13 0
      .github/workflows/windows-x86.yaml
  17. 80 10
      README.md
  18. 19 19
      build-swift-macos.sh
  19. 1 1
      cmake/ncnn.cmake
  20. 14 0
      sherpa-ncnn/csrc/CMakeLists.txt
  21. 179 0
      sherpa-ncnn/csrc/circular-buffer.cc
  22. 75 0
      sherpa-ncnn/csrc/circular-buffer.h
  23. 37 0
      sherpa-ncnn/csrc/file-utils.cc
  24. 43 0
      sherpa-ncnn/csrc/file-utils.h
  25. 0 1
      sherpa-ncnn/csrc/model.h
  26. 139 0
      sherpa-ncnn/csrc/sherpa-ncnn-vad.cc
  27. 80 0
      sherpa-ncnn/csrc/silero-vad-model-config.cc
  28. 65 0
      sherpa-ncnn/csrc/silero-vad-model-config.h
  29. 284 0
      sherpa-ncnn/csrc/silero-vad-model.cc
  30. 76 0
      sherpa-ncnn/csrc/silero-vad-model.h
  31. 216 0
      sherpa-ncnn/csrc/voice-activity-detector.cc
  32. 73 0
      sherpa-ncnn/csrc/voice-activity-detector.h
  33. 259 64
      sherpa-ncnn/csrc/wave-reader.cc
  34. 12 7
      sherpa-ncnn/csrc/wave-reader.h
  35. 95 0
      sherpa-ncnn/csrc/wave-writer.cc
  36. 40 0
      sherpa-ncnn/csrc/wave-writer.h
  37. 2 2
      swift-api-examples/run-decode-file.sh

+ 29 - 0
.github/scripts/test-vad.sh

@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+#
+# Smoke test for the VAD example binary.
+#
+# The caller must export EXE (the name of the binary to run) and make sure it
+# is reachable via PATH. The script downloads the silero-vad model and a test
+# wave file into ./build, runs $EXE there without arguments, lists the wave
+# files that are present afterwards and finally removes the downloaded data.
+
+set -e
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+# Fail early with a clear message when the caller forgot to export EXE.
+: "${EXE:?Please set EXE to the name of the VAD binary, e.g., sherpa-ncnn-vad}"
+
+echo "EXE is $EXE"
+echo "PATH: $PATH"
+
+which "$EXE"
+
+cd build
+
+# -f makes curl return a non-zero status on HTTP errors (e.g., 404) instead of
+# saving the error page, so "set -e" stops right at the failing download.
+curl -fSL -O https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-silero-vad.tar.bz2
+tar xvf sherpa-ncnn-silero-vad.tar.bz2
+rm sherpa-ncnn-silero-vad.tar.bz2
+ls -lh sherpa-ncnn-silero-vad
+
+curl -fSL -O https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/lei-jun-test.wav
+
+# NOTE(review): the binary is started without arguments, so it presumably
+# uses the model directory and wave file downloaded above -- confirm against
+# sherpa-ncnn-vad.cc.
+"$EXE"
+
+ls -lh *.wav
+rm -rfv sherpa-ncnn-*
+rm *.wav

+ 2 - 0
.github/workflows/aarch64-linux-gnu.yaml

@@ -23,6 +23,8 @@ on:
       - 'sherpa-ncnn/csrc/*'
       - 'toolchains/aarch64-linux-gnu.toolchain.cmake'
 
+  workflow_dispatch:
+
 concurrency:
   group: aarch64-linux-gnu-${{ github.ref }}
   cancel-in-progress: true

+ 19 - 1
.github/workflows/android.yaml

@@ -53,6 +53,11 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ matrix.os }}-android
+
       - name: Display NDK HOME
         shell: bash
         run: |
@@ -62,6 +67,9 @@ jobs:
       - name: build android arm64-v8a
         shell: bash
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           export ANDROID_NDK=$ANDROID_NDK_LATEST_HOME
           ./build-android-arm64-v8a.sh
           mkdir -p jniLibs/arm64-v8a/
@@ -70,6 +78,9 @@ jobs:
       - name: build android armv7-eabi
         shell: bash
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           export ANDROID_NDK=$ANDROID_NDK_LATEST_HOME
           ./build-android-armv7-eabi.sh
           mkdir -p ./jniLibs/armeabi-v7a/
@@ -78,6 +89,9 @@ jobs:
       - name: build android x86_64
         shell: bash
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           export ANDROID_NDK=$ANDROID_NDK_LATEST_HOME
           ./build-android-x86-64.sh
           mkdir -p ./jniLibs/x86_64
@@ -86,6 +100,9 @@ jobs:
       - name: build android x86
         shell: bash
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           export ANDROID_NDK=$ANDROID_NDK_LATEST_HOME
           ./build-android-x86.sh
           mkdir -p ./jniLibs/x86
@@ -117,10 +134,11 @@ jobs:
           git config --global user.email "csukuangfj@gmail.com"
           git config --global user.name "Fangjun Kuang"
 
+          export GIT_CLONE_PROTECTION_ACTIVE=false
           GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-ncnn-libs huggingface
 
           cd huggingface
-          git lfs pull
+          git pull
 
           cp -v ../sherpa-ncnn-*-android.tar.bz2 ./
 

+ 2 - 0
.github/workflows/arm-linux-gnueabihf.yaml

@@ -23,6 +23,8 @@ on:
       - 'sherpa-ncnn/csrc/*'
       - 'toolchains/arm-linux-gnueabihf.toolchain.cmake'
 
+  workflow_dispatch:
+
 concurrency:
   group: arm-linux-gnueabihf-${{ github.ref }}
   cancel-in-progress: true

+ 1 - 1
.github/workflows/build-wasm-simd.yaml

@@ -36,7 +36,7 @@ env:
     ${{ github.event.release.tag_name != '' || github.event.inputs.release == 'true' }}
 
 concurrency:
-  group: linux-${{ github.ref }}
+  group: build-wasm-simd-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:

+ 10 - 0
.github/workflows/jni.yaml

@@ -22,6 +22,8 @@ on:
       - 'sherpa-ncnn/jni/*'
       - '.github/scripts/test-jni.sh'
 
+  workflow_dispatch:
+
 concurrency:
   group: jni-${{ github.ref }}
   cancel-in-progress: true
@@ -42,6 +44,11 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ matrix.os }}-jni
+
       - name: Display kotlin version
         shell: bash
         run: |
@@ -56,4 +63,7 @@ jobs:
       - name:  Run JNI test
         shell: bash
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           .github/scripts/test-jni.sh

+ 27 - 1
.github/workflows/linux.yaml

@@ -7,6 +7,7 @@ on:
     paths:
       - '.github/workflows/linux.yaml'
       - '.github/scripts/run-test.sh'
+      - '.github/scripts/test-vad.sh'
       - 'CMakeLists.txt'
       - 'cmake/**'
       - 'sherpa-ncnn/csrc/*'
@@ -16,11 +17,14 @@ on:
     paths:
       - '.github/workflows/linux.yaml'
       - '.github/scripts/run-test.sh'
+      - '.github/scripts/test-vad.sh'
       - 'CMakeLists.txt'
       - 'cmake/**'
       - 'sherpa-ncnn/csrc/*'
       - 'sherpa-ncnn/csrc/*'
 
+  workflow_dispatch:
+
 concurrency:
   group: linux-${{ github.ref }}
   cancel-in-progress: true
@@ -37,19 +41,31 @@ jobs:
         os: [ubuntu-latest]
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ matrix.os }}-linux
+
       - name: Configure CMake
         shell: bash
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+          cmake --version
+
           mkdir build
           cd build
           cmake -D CMAKE_BUILD_TYPE=Release -DSHERPA_NCNN_ENABLE_FFMPEG_EXAMPLES=OFF ..
 
       - name: Build sherpa-ncnn for ubuntu
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           cd build
           make -j2
 
@@ -59,6 +75,9 @@ jobs:
           ls -lh bin/sherpa-ncnn
           file bin/sherpa-ncnn
 
+          ls -lh bin/sherpa-ncnn-vad
+          file bin/sherpa-ncnn-vad
+
           ls -lh bin/sherpa-ncnn-microphone
           file bin/sherpa-ncnn-microphone
 
@@ -71,6 +90,13 @@ jobs:
           name: sherpa-ncnn-pre-built-binaries-os-${{ matrix.os }}
           path: ./build/bin
 
+      - name: Test vad
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-ncnn-vad
+
+          .github/scripts/test-vad.sh
+
       - name: Test sherpa-ncnn
         run: |
           export PATH=$PWD/build/bin:$PATH

+ 28 - 2
.github/workflows/macos.yaml

@@ -7,6 +7,7 @@ on:
     paths:
       - '.github/workflows/macos.yaml'
       - '.github/scripts/run-test.sh'
+      - '.github/scripts/test-vad.sh'
       - 'CMakeLists.txt'
       - 'cmake/**'
       - 'sherpa-ncnn/csrc/*'
@@ -16,10 +17,13 @@ on:
     paths:
       - '.github/workflows/macos.yaml'
       - '.github/scripts/run-test.sh'
+      - '.github/scripts/test-vad.sh'
       - 'CMakeLists.txt'
       - 'cmake/**'
       - 'sherpa-ncnn/csrc/*'
 
+  workflow_dispatch:
+
 concurrency:
   group: macos-${{ github.ref }}
   cancel-in-progress: true
@@ -33,22 +37,34 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [macos-latest]
+        os: [macos-latest, macos-13]
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ matrix.os }}-macos
+
       - name: Configure CMake
         shell: bash
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+          cmake --version
+
           mkdir build
           cd build
           cmake -D CMAKE_BUILD_TYPE=Release ..
 
       - name: Build sherpa-ncnn for macos
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           cd build
           make -j2
 
@@ -58,6 +74,9 @@ jobs:
           ls -lh bin/sherpa-ncnn
           file bin/sherpa-ncnn
 
+          ls -lh bin/sherpa-ncnn-vad
+          file bin/sherpa-ncnn-vad
+
           ls -lh bin/sherpa-ncnn-microphone
           file bin/sherpa-ncnn-microphone
 
@@ -70,6 +89,13 @@ jobs:
           name: sherpa-ncnn-pre-built-binaries-os-${{ matrix.os }}
           path: ./build/bin
 
+      - name: Test vad
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-ncnn-vad
+
+          .github/scripts/test-vad.sh
+
       - name: Test sherpa-ncnn
         run: |
           export PATH=$PWD/build/bin:$PATH

+ 8 - 0
.github/workflows/npm.yaml

@@ -25,6 +25,11 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ matrix.os }}-npm
+
       - name: Install emsdk
         uses: mymindstorm/setup-emsdk@v14
 
@@ -59,6 +64,9 @@ jobs:
         env:
           NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           ./build-wasm-simd-for-nodejs.sh
 
           cp -v build-wasm-simd-for-nodejs/install/bin/wasm/sherpa-ncnn-wasm-main.js ./scripts/nodejs

+ 10 - 0
.github/workflows/swift-api-test.yaml

@@ -24,6 +24,8 @@ on:
       - 'sherpa-ncnn/swift-api-examples/*'
       - 'build-swift-macos.sh'
 
+  workflow_dispatch:
+
 concurrency:
   group: swift-api-test-${{ github.ref }}
   cancel-in-progress: true
@@ -44,7 +46,15 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ matrix.os }}-swift
+
       - name: Run swift-api-test
         shell: bash
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           .github/scripts/swift-api-test.sh

+ 4 - 7
.github/workflows/test-dot-net.yaml

@@ -15,6 +15,8 @@ on:
       - '.github/workflows/test-dot-net'
       - 'dotnet-examples/**'
 
+  workflow_dispatch:
+
   schedule:
     # minute (0-59)
     # hour (0-23)
@@ -25,7 +27,7 @@ on:
     - cron: "50 23 * * *"
 
 concurrency:
-  group: test-dot-net
+  group: test-dot-net-${{ github.ref }}
   cancel-in-progress: true
 
 permissions:
@@ -44,13 +46,8 @@ jobs:
         with:
           fetch-depth: 0
 
-      - name: Setup .NET Core 3.1
-        uses: actions/setup-dotnet@v1
-        with:
-          dotnet-version: 3.1.x
-
       - name: Setup .NET 6.0
-        uses: actions/setup-dotnet@v1
+        uses: actions/setup-dotnet@v4
         with:
           dotnet-version: 6.0.x
 

+ 2 - 0
.github/workflows/test-pip-install.yaml

@@ -13,6 +13,8 @@ on:
     # nightly test at 22:50 UTC time every day
     - cron: "50 22 * * *"
 
+  workflow_dispatch:
+
 concurrency:
   group: test_pip_install-${{ github.ref }}
   cancel-in-progress: true

+ 9 - 0
.github/workflows/wasm-simd-hf-space-en.yaml

@@ -23,6 +23,12 @@ jobs:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ matrix.os }}-wasm
+
       - name: Install emsdk
         uses: mymindstorm/setup-emsdk@v14
 
@@ -53,6 +59,9 @@ jobs:
       - name: Build sherpa-ncnn for WebAssembly
         shell: bash
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           ./build-wasm-simd.sh
 
       - name: collect files

+ 9 - 0
.github/workflows/wasm-simd-hf-space-zh-en.yaml

@@ -23,6 +23,12 @@ jobs:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ matrix.os }}-wasm
+
       - name: Install emsdk
         uses: mymindstorm/setup-emsdk@v14
 
@@ -53,6 +59,9 @@ jobs:
       - name: Build sherpa-ncnn for WebAssembly
         shell: bash
         run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
           ./build-wasm-simd.sh
 
       - name: collect files

+ 14 - 0
.github/workflows/windows-x64.yaml

@@ -7,6 +7,7 @@ on:
     paths:
       - '.github/workflows/windows-x64.yaml'
       - '.github/scripts/run-test.sh'
+      - '.github/scripts/test-vad.sh'
       - 'CMakeLists.txt'
       - 'cmake/**'
       - 'sherpa-ncnn/csrc/*'
@@ -16,10 +17,13 @@ on:
     paths:
       - '.github/workflows/windows-x64.yaml'
       - '.github/scripts/run-test.sh'
+      - '.github/scripts/test-vad.sh'
       - 'CMakeLists.txt'
       - 'cmake/**'
       - 'sherpa-ncnn/csrc/*'
 
+  workflow_dispatch:
+
 concurrency:
   group: windows-x64-${{ github.ref }}
   cancel-in-progress: true
@@ -70,8 +74,18 @@ jobs:
           cmake --build . --config Release -- -m:2
 
           ls -lh ./bin/Release/sherpa-ncnn.exe
+          ls -lh ./bin/Release/sherpa-ncnn-vad.exe
+          ls -lh ./bin/Release/decode-file-c-api.exe
           ls -lh ./bin/Release/sherpa-ncnn-microphone.exe
 
+      - name: Test VAD
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin/Release:$PATH
+          export EXE=sherpa-ncnn-vad.exe
+
+          .github/scripts/test-vad.sh
+
       - name: Test sherpa-ncnn
         shell: bash
         run: |

+ 13 - 0
.github/workflows/windows-x86.yaml

@@ -7,6 +7,7 @@ on:
     paths:
       - '.github/workflows/windows-x86.yaml'
       - '.github/scripts/run-test.sh'
+      - '.github/scripts/test-vad.sh'
       - 'CMakeLists.txt'
       - 'cmake/**'
       - 'sherpa-ncnn/csrc/*'
@@ -16,10 +17,13 @@ on:
     paths:
       - '.github/workflows/windows-x86.yaml'
       - '.github/scripts/run-test.sh'
+      - '.github/scripts/test-vad.sh'
       - 'CMakeLists.txt'
       - 'cmake/**'
       - 'sherpa-ncnn/csrc/*'
 
+  workflow_dispatch:
+
 concurrency:
   group: windows-x86-${{ github.ref }}
   cancel-in-progress: true
@@ -70,9 +74,18 @@ jobs:
           cmake --build . --config Release -- -m:2
 
           ls -lh ./bin/Release/sherpa-ncnn.exe
+          ls -lh ./bin/Release/sherpa-ncnn-vad.exe
           ls -lh ./bin/Release/decode-file-c-api.exe
           ls -lh ./bin/Release/sherpa-ncnn-microphone.exe
 
+      - name: Test VAD
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin/Release:$PATH
+          export EXE=sherpa-ncnn-vad.exe
+
+          .github/scripts/test-vad.sh
+
       - name: Test sherpa-ncnn
         shell: bash
         run: |

+ 80 - 10
README.md

@@ -1,14 +1,60 @@
-# Introduction
+### Supported functions
 
-You can use `sherpa-ncnn` for **real-time** speech recognition (i.e., speech-to-text)
-on
+|Real-time Speech recognition| Voice activity detection |
+|----------------------------|--------------------------|
+|   ✔️                        |         ✔️                |
 
-  - Linux
-  - macOS
-  - Windows
-  - Embedded Linux (32-bit arm and 64-bit aarch64)
-  - Android
-  - etc ...
+### Supported platforms
+
+|Architecture| Android          | iOS           | Windows    | macOS | linux |
+|------------|------------------|---------------|------------|-------|-------|
+|   x64      |  ✔️               |               |   ✔️        | ✔️     |  ✔️    |
+|   x86      |  ✔️               |               |   ✔️        |       |       |
+|   arm64    |  ✔️               | ✔️             |   ✔️        | ✔️     |  ✔️    |
+|   arm32    |  ✔️               |               |            |       |  ✔️    |
+|   riscv64  |                  |               |            |       |  ✔️    |
+
+### Supported programming languages
+
+| 1. C++ | 2. C  | 3. Python | 4. JavaScript |
+|--------|-------|-----------|---------------|
+|   ✔️    | ✔️     | ✔️         |    ✔️          |
+
+|5. Go   | 6. C# | 7. Kotlin | 8. Swift |
+|--------|-------|-----------|----------|
+| ✔️      |  ✔️    | ✔️         |  ✔️       |
+
+
+It also supports WebAssembly.
+
+## Introduction
+
+This repository supports running the following functions **locally**
+
+  - Streaming speech-to-text (i.e., real-time speech recognition)
+  - VAD (e.g., [silero-vad](https://github.com/snakers4/silero-vad))
+
+on the following platforms and operating systems:
+
+  - x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64)
+  - Linux, macOS, Windows, openKylin
+  - Android, WearOS
+  - iOS
+  - NodeJS
+  - WebAssembly
+  - [Raspberry Pi](https://www.raspberrypi.com/)
+  - [RV1126](https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf)
+  - [LicheePi4A](https://sipeed.com/licheepi4a)
+  - [VisionFive 2](https://www.starfivetech.com/en/site/boards)
+  - [旭日X3派](https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html)
+  - etc
+
+with the following APIs
+
+  - C++, C, Python, Go, ``C#``
+  - Kotlin
+  - JavaScript
+  - Swift
 
 We support all platforms that [ncnn](https://github.com/tencent/ncnn) supports.
 
@@ -40,5 +86,29 @@ with `sherpa-ncnn` using a microphone:
   - `Chinese` Android demo : <https://www.bilibili.com/video/BV1744y1Z76H>
   - `Chinese poem with background music` Android demo : <https://www.bilibili.com/video/BV1vR4y1k7eo>
 
+### Links for pre-built Android APKs
+
+| Description                    | URL                                                       |
+|--------------------------------|-----------------------------------------------------------|
+| Streaming speech recognition   | [Address](https://github.com/k2-fsa/sherpa-ncnn/releases) |
+
+### Links for pre-trained models
+
+https://github.com/k2-fsa/sherpa-ncnn/releases/tag/models
+
+### Useful links
+
+- Documentation: https://k2-fsa.github.io/sherpa/ncnn/
+- Bilibili 演示视频: https://search.bilibili.com/all?keyword=%E6%96%B0%E4%B8%80%E4%BB%A3Kaldi
+
+### How to reach us
+
+Please see
+https://k2-fsa.github.io/sherpa/social-groups.html
+for 新一代 Kaldi **微信交流群** and **QQ 交流群**.
+
+
+## See also
 
-See also <https://github.com/k2-fsa/sherpa>
+  - <https://github.com/k2-fsa/sherpa-onnx>
+  - <https://github.com/k2-fsa/sherpa>

+ 19 - 19
build-swift-macos.sh

@@ -18,16 +18,16 @@ if [ ! -d openmp-11.0.0.src ]; then
   popd
 fi
 
-if [ ! -f openmp-11.0.0.src/build-x86_64/install/include/omp.h ]; then
+if [ ! -f openmp-11.0.0.src/build/install/include/omp.h ]; then
   pushd openmp-11.0.0.src
 
-  mkdir -p build-x86_64
-  cd build-x86_64
+  mkdir -p build
+  cd build
 
   cmake \
     -DCMAKE_BUILD_TYPE=Release \
     -DCMAKE_INSTALL_PREFIX=install \
-    -DCMAKE_OSX_ARCHITECTURES="x86_64" \
+    -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \
     -DLIBOMP_ENABLE_SHARED=OFF \
     -DLIBOMP_OMPT_SUPPORT=OFF \
     -DLIBOMP_USE_HWLOC=OFF ..
@@ -41,23 +41,23 @@ fi
 rm -rf  openmp.xcframework
 
 xcodebuild -create-xcframework \
-      -library "openmp-11.0.0.src/build-x86_64/install/lib/libomp.a" \
+      -library "openmp-11.0.0.src/build/install/lib/libomp.a" \
       -output openmp.xcframework
 
 mkdir -p openmp.xcframework/Headers
-cp -v openmp-11.0.0.src/build-x86_64/install/include/omp.h openmp.xcframework/Headers
+cp -v openmp-11.0.0.src/build/install/include/omp.h openmp.xcframework/Headers
 
-export CPLUS_INCLUDE_PATH=$PWD/openmp-11.0.0.src/build-x86_64/install/include:$CPLUS_INCLUDE_PATH
-mkdir -p build-x86_64
-pushd build-x86_64
+export CPLUS_INCLUDE_PATH=$PWD/openmp-11.0.0.src/build/install/include:$CPLUS_INCLUDE_PATH
+mkdir -p build
+pushd build
 
 cmake \
-  -DCMAKE_OSX_ARCHITECTURES="x86_64" \
+  -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \
   -DOpenMP_C_FLAGS="-Xclang -fopenmp" \
   -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
   -DOpenMP_C_LIB_NAMES="libomp" \
   -DOpenMP_CXX_LIB_NAMES="libomp" \
-  -DOpenMP_libomp_LIBRARY="$PWD/../openmp-11.0.0.src/build-x86_64/install/lib/libomp.a" \
+  -DOpenMP_libomp_LIBRARY="$PWD/../openmp-11.0.0.src/build/install/lib/libomp.a" \
   \
   -DCMAKE_INSTALL_PREFIX=./install \
   -DCMAKE_BUILD_TYPE=Release \
@@ -81,20 +81,20 @@ popd
 
 rm -rf sherpa-ncnn.xcframework
 
-libtool -static -o ./build-x86_64/install/lib/sherpa-ncnn.a \
-  build-x86_64/install/lib/libncnn.a \
-  build-x86_64/install/lib/libsherpa-ncnn-c-api.a \
-  build-x86_64/install/lib/libsherpa-ncnn-core.a \
-  build-x86_64/install/lib/libkaldi-native-fbank-core.a
+libtool -static -o ./build/install/lib/sherpa-ncnn.a \
+  build/install/lib/libncnn.a \
+  build/install/lib/libsherpa-ncnn-c-api.a \
+  build/install/lib/libsherpa-ncnn-core.a \
+  build/install/lib/libkaldi-native-fbank-core.a
 
 xcodebuild -create-xcframework \
-      -library "build-x86_64/install/lib/sherpa-ncnn.a" \
+      -library "build/install/lib/sherpa-ncnn.a" \
       -output sherpa-ncnn.xcframework
 
 mkdir -p sherpa-ncnn.xcframework/Headers
-cp -av build-x86_64/install/include/* sherpa-ncnn.xcframework/Headers
+cp -av build/install/include/* sherpa-ncnn.xcframework/Headers
 
-pushd sherpa-ncnn.xcframework/macos-x86_64/
+pushd sherpa-ncnn.xcframework/macos-arm64_x86_64/
 ln -s sherpa-ncnn.a libsherpa-ncnn.a
 popd
 

+ 1 - 1
cmake/ncnn.cmake

@@ -112,7 +112,7 @@ function(download_ncnn)
     # UnaryOp
     ConvolutionDepthWise
     # Padding # required by innerproduct and convolution
-    Squeeze
+    # Squeeze
     # ExpandDims
     Normalize
     # Permute

+ 14 - 0
sherpa-ncnn/csrc/CMakeLists.txt

@@ -6,6 +6,7 @@ set(sherpa_ncnn_core_srcs
   decoder.cc
   endpoint.cc
   features.cc
+  file-utils.cc
   greedy-search-decoder.cc
   hypothesis.cc
   lstm-model.cc
@@ -21,8 +22,17 @@ set(sherpa_ncnn_core_srcs
   symbol-table.cc
   tensorasstrided.cc
   wave-reader.cc
+  wave-writer.cc
   zipformer-model.cc
 )
+
+list(APPEND sherpa_ncnn_core_srcs
+  circular-buffer.cc
+  silero-vad-model-config.cc
+  silero-vad-model.cc
+  voice-activity-detector.cc
+)
+
 add_library(sherpa-ncnn-core ${sherpa_ncnn_core_srcs})
 target_link_libraries(sherpa-ncnn-core PUBLIC kaldi-native-fbank-core ncnn)
 
@@ -38,6 +48,10 @@ if(NOT SHERPA_NCNN_ENABLE_PYTHON)
     target_link_libraries(sherpa-ncnn PRIVATE sherpa-ncnn-core)
     install(TARGETS sherpa-ncnn DESTINATION bin)
 
+    add_executable(sherpa-ncnn-vad sherpa-ncnn-vad.cc)
+    target_link_libraries(sherpa-ncnn-vad PRIVATE sherpa-ncnn-core)
+    install(TARGETS sherpa-ncnn-vad DESTINATION bin)
+
     if(SHERPA_NCNN_HAS_ALSA)
       add_executable(sherpa-ncnn-alsa sherpa-ncnn-alsa.cc alsa.cc)
       target_link_libraries(sherpa-ncnn-alsa PRIVATE sherpa-ncnn-core)

+ 179 - 0
sherpa-ncnn/csrc/circular-buffer.cc

@@ -0,0 +1,179 @@
+/**
+ * Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "sherpa-ncnn/csrc/circular-buffer.h"
+
+#include <algorithm>
+
+#include "platform.h"  // NOLINT
+
+namespace sherpa_ncnn {
+
+CircularBuffer::CircularBuffer(int32_t capacity) {
+  // Happy path first: any positive capacity simply allocates the storage.
+  if (capacity > 0) {
+    buffer_.resize(capacity);
+    return;
+  }
+
+  // A zero or negative capacity cannot be used; report it and terminate.
+  NCNN_LOGE("Please specify a positive capacity. Given: %d\n", capacity);
+  exit(-1);
+}
+
+void CircularBuffer::Resize(int32_t new_capacity) {
+  // Enlarge the backing storage to new_capacity. Every buffered element stays
+  // reachable through the same linear index, i.e., the element with linear
+  // index i moves from position (i % old capacity) to (i % new_capacity).
+  const int32_t old_capacity = static_cast<int32_t>(buffer_.size());
+  if (new_capacity <= old_capacity) {
+    NCNN_LOGE("new_capacity (%d) <= original capacity (%d). Skip it.",
+              new_capacity, old_capacity);
+    return;
+  }
+
+  const int32_t num_elements = Size();
+  if (num_elements == 0) {
+    // Nothing has to be relocated; extending the vector in place is enough.
+    buffer_.resize(new_capacity);
+    return;
+  }
+
+  std::vector<float> grown(new_capacity);
+
+  // Copy `count` contiguous floats starting at buffer_[src] into `grown`,
+  // beginning at grown[dst] and continuing at grown[0] once the physical end
+  // of `grown` is reached.
+  auto copy_run = [&](int32_t src, int32_t count, int32_t dst) {
+    const int32_t before_wrap = std::min(count, new_capacity - dst);
+    auto first = buffer_.begin() + src;
+    std::copy(first, first + before_wrap, grown.begin() + dst);
+    std::copy(first + before_wrap, first + count, grown.begin());
+  };
+
+  const int32_t src_start = head_ % old_capacity;
+  const int32_t dst_start = head_ % new_capacity;
+
+  // The live data occupies at most two contiguous runs in the old storage:
+  // the part from src_start towards the physical end, followed by the part
+  // that wrapped around to position 0 (possibly empty).
+  const int32_t run1 = std::min(num_elements, old_capacity - src_start);
+  const int32_t run2 = num_elements - run1;
+
+  copy_run(src_start, run1, dst_start);
+  copy_run(0, run2, (dst_start + run1) % new_capacity);
+
+  buffer_.swap(grown);
+}
+
+void CircularBuffer::Push(const float *p, int32_t n) {
+  // Append the n floats starting at p to the end of the buffer. If they do
+  // not fit, the storage is enlarged first, so existing data is kept.
+  //
+  // @param p Pointer to the first element to append
+  // @param n Number of elements to append. Must not be negative.
+
+  // Get() and Pop() reject a negative n; do the same here. Without this
+  // check tail_ would move backwards and std::copy() below would be handed
+  // an inverted range, which is undefined behavior.
+  if (n < 0) {
+    NCNN_LOGE("Invalid n: %d", n);
+    return;
+  }
+
+  int32_t capacity = static_cast<int32_t>(buffer_.size());
+  int32_t size = Size();
+  if (n + size > capacity) {
+    // Grow to twice the current capacity, or to exactly what is needed if
+    // doubling is still too small.
+    int32_t new_capacity = std::max(capacity * 2, n + size);
+    NCNN_LOGE(
+        "Overflow! n: %d, size: %d, n+size: %d, capacity: %d. Increase "
+        "capacity to: %d",
+        n, size, n + size, capacity, new_capacity);
+    Resize(new_capacity);
+
+    capacity = new_capacity;
+  }
+
+  // tail_ is a linear index; map it to a physical position inside buffer_.
+  int32_t start = tail_ % capacity;
+
+  tail_ += n;
+
+  if (start + n < capacity) {
+    // The new data ends before the physical end of the storage.
+    std::copy(p, p + n, buffer_.begin() + start);
+    return;
+  }
+
+  // Otherwise fill up to the physical end and put the rest at the front.
+  int32_t part1_size = capacity - start;
+
+  std::copy(p, p + part1_size, buffer_.begin() + start);
+
+  std::copy(p + part1_size, p + n, buffer_.begin());
+}
+
+std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const {
+  // Return a copy of the n elements with linear indices
+  // [start_index, start_index + n). If the request is not fully covered by
+  // the buffered data, a message is logged and an empty vector is returned.
+  const bool index_in_range = (head_ <= start_index) && (start_index < tail_);
+  if (!index_in_range) {
+    NCNN_LOGE("Invalid start_index: %d. head_: %d, tail_: %d", start_index,
+              head_, tail_);
+    return {};
+  }
+
+  const int32_t available = Size();
+  if (n < 0 || available < n) {
+    NCNN_LOGE("Invalid n: %d. size: %d", n, available);
+    return {};
+  }
+
+  const int32_t offset = start_index - head_;
+  if (offset + n > available) {
+    NCNN_LOGE("Invalid start_index: %d and n: %d. head_: %d, size: %d",
+              start_index, n, head_, available);
+    return {};
+  }
+
+  const int32_t capacity = static_cast<int32_t>(buffer_.size());
+  const int32_t begin = start_index % capacity;
+
+  // Number of requested elements stored before the physical end of buffer_.
+  const int32_t before_wrap = std::min(n, capacity - begin);
+
+  std::vector<float> ans;
+  ans.reserve(n);
+  ans.insert(ans.end(), buffer_.begin() + begin,
+             buffer_.begin() + begin + before_wrap);
+
+  // The remainder, if any, wrapped around to the front of buffer_.
+  ans.insert(ans.end(), buffer_.begin(), buffer_.begin() + (n - before_wrap));
+
+  return ans;
+}
+
+void CircularBuffer::Pop(int32_t n) {
+  // Drop the n oldest elements by advancing head_. A request outside of
+  // [0, Size()] is logged and otherwise ignored.
+  const int32_t available = Size();
+  const bool is_valid = (0 <= n) && (n <= available);
+
+  if (is_valid) {
+    head_ += n;
+    return;
+  }
+
+  NCNN_LOGE("Invalid n: %d. size: %d", n, available);
+}
+
+}  // namespace sherpa_ncnn

+ 75 - 0
sherpa-ncnn/csrc/circular-buffer.h

@@ -0,0 +1,75 @@
+/**
+ * Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SHERPA_NCNN_CSRC_CIRCULAR_BUFFER_H_
+#define SHERPA_NCNN_CSRC_CIRCULAR_BUFFER_H_
+
+#include <cstdint>
+#include <vector>
+
+namespace sherpa_ncnn {
+
+class CircularBuffer {
+ public:
+  // @param capacity Initial number of floats the buffer can hold. It must be
+  //                 positive; otherwise we print a message and exit.
+  explicit CircularBuffer(int32_t capacity);
+
+  // Append an array to the end of the buffer
+  //
+  // @param p Pointer to the start address of the array
+  // @param n Number of elements in the array
+  //
+  // Note: If n + Size() > capacity, we print a message and grow the buffer.
+  void Push(const float *p, int32_t n);
+
+  // @param start_index Should be in the range [head_, tail_)
+  // @param n Number of elements to get
+  // @return A vector of size n; it is empty if start_index or n is invalid
+  std::vector<float> Get(int32_t start_index, int32_t n) const;
+
+  // Remove n elements from the front of the buffer
+  //
+  // @param n Should be in the range [0, Size()]; otherwise nothing is removed
+  void Pop(int32_t n);
+
+  // Number of elements in the buffer.
+  int32_t Size() const { return tail_ - head_; }
+
+  // Current position of the head (linear index of the oldest element)
+  int32_t Head() const { return head_; }
+
+  // Current position of the tail (linear index one past the newest element)
+  int32_t Tail() const { return tail_; }
+
+  void Reset() {  // forget all elements; the storage itself is not touched
+    head_ = 0;
+    tail_ = 0;
+  }
+
+  void Resize(int32_t new_capacity);  // grow only; smaller values are skipped
+
+ private:
+  std::vector<float> buffer_;  // physical storage, indexed by linear % size
+
+  int32_t head_ = 0;  // linear index; only Pop() advances it; never wraps
+  int32_t tail_ = 0;  // linear index; only Push() advances it; never wraps
+};
+
+}  // namespace sherpa_ncnn
+
+#endif  // SHERPA_NCNN_CSRC_CIRCULAR_BUFFER_H_

+ 37 - 0
sherpa-ncnn/csrc/file-utils.cc

@@ -0,0 +1,37 @@
+/**
+ * Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sherpa-ncnn/csrc/file-utils.h"
+
+#include <fstream>
+#include <string>
+
+namespace sherpa_ncnn {
+
+// Return true if `filename` can be opened for reading; false otherwise.
+bool FileExists(const std::string &filename) {
+  std::ifstream is(filename);
+  return is.good();
+}
+
+// Log an error and terminate the process with exit code -1 when `filename`
+// does not exist. Return normally otherwise.
+void AssertFileExists(const std::string &filename) {
+  if (FileExists(filename)) {
+    return;
+  }
+
+  NCNN_LOGE("filename '%s' does not exist", filename.c_str());
+  exit(-1);
+}
+
+}  // namespace sherpa_ncnn

+ 43 - 0
sherpa-ncnn/csrc/file-utils.h

@@ -0,0 +1,43 @@
+/**
+ * Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SHERPA_NCNN_CSRC_FILE_UTILS_H_
+#define SHERPA_NCNN_CSRC_FILE_UTILS_H_
+
+#include <fstream>
+#include <string>
+
+#include "platform.h"  // for NCNN_LOGE, NOLINT
+
+namespace sherpa_ncnn {
+
+/** Check whether a given file exists.
+ *
+ * The check is done by trying to open the file for reading, so a path that
+ * exists but cannot be opened is reported as non-existent.
+ *
+ * @param filename Path to check.
+ * @return Return true if the file can be opened for reading;
+ *         return false otherwise.
+ */
+bool FileExists(const std::string &filename);
+
+/** Print an error message and terminate the program with exit(-1)
+ *  if the file does not exist. Return normally if it exists.
+ *
+ * @param filename The file to check.
+ */
+void AssertFileExists(const std::string &filename);
+
+}  // namespace sherpa_ncnn
+
+#endif  // SHERPA_NCNN_CSRC_FILE_UTILS_H_

+ 0 - 1
sherpa-ncnn/csrc/model.h

@@ -132,7 +132,6 @@ class Model {
   // running the encoder network
   virtual int32_t Offset() const = 0;
 
- protected:
   static void InitNet(ncnn::Net &net, const std::string &param,
                       const std::string &bin);
 

+ 139 - 0
sherpa-ncnn/csrc/sherpa-ncnn-vad.cc

@@ -0,0 +1,139 @@
+/**
+ * Copyright (c)  2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+
+#include "sherpa-ncnn/csrc/file-utils.h"
+#include "sherpa-ncnn/csrc/voice-activity-detector.h"
+#include "sherpa-ncnn/csrc/wave-reader.h"
+#include "sherpa-ncnn/csrc/wave-writer.h"
+
+// Remove silences from ./lei-jun-test.wav with silero vad and save the
+// concatenated speech segments to ./out-without-silence.wav.
+//
+// All paths are hard-coded; see the usage message below for how to download
+// the model and the test wave. The start and end time of every detected
+// speech segment are printed to stderr.
+//
+// @return 0 on success; -1 if the model or the input wave is missing,
+//         if the wave cannot be read, or if the output cannot be written.
+int main() {
+  std::string usage = R"usage(
+This file shows how to use silero vad to remove silences from a file.
+
+===========Usage============:
+
+0. Build sherpa-ncnn
+--------------------
+
+mkdir -p $HOME/open-source
+cd $HOME/open-source
+git clone https://github.com/k2-fsa/sherpa-ncnn
+cd sherpa-ncnn
+mkdir build
+cd build
+cmake ..
+make -j3
+
+1. Download the vad model
+-------------------------
+
+cd $HOME/open-source/sherpa-ncnn/build
+wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-silero-vad.tar.bz2
+tar xvf sherpa-ncnn-silero-vad.tar.bz2
+
+2. Download the test data
+-------------------------
+
+cd $HOME/open-source/sherpa-ncnn/build
+wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/lei-jun-test.wav
+wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/Obama.wav
+
+3. Run it!
+----------
+
+cd $HOME/open-source/sherpa-ncnn/build
+./bin/sherpa-ncnn-vad
+
+**Note**: We only support 16000Hz wav files.
+  )usage";
+
+  sherpa_ncnn::SileroVadModelConfig config;
+  config.sample_rate = 16000;
+  config.param = "./sherpa-ncnn-silero-vad/silero.ncnn.param";
+  config.bin = "./sherpa-ncnn-silero-vad/silero.ncnn.bin";
+  config.window_size = 512;
+
+  if (!config.Validate()) {
+    fprintf(stderr, "%s %d: %s", __FILE__, static_cast<int32_t>(__LINE__),
+            usage.c_str());
+    return -1;
+  }
+
+  std::string input_wave = "./lei-jun-test.wav";
+  // std::string input_wave = "./Obama.wav";
+  if (!sherpa_ncnn::FileExists(input_wave)) {
+    fprintf(stderr, "%s %d: %s", __FILE__, static_cast<int32_t>(__LINE__),
+            usage.c_str());
+    return -1;
+  }
+
+  bool is_ok = false;
+  std::vector<float> samples =
+      sherpa_ncnn::ReadWave(input_wave, config.sample_rate, &is_ok);
+  if (!is_ok) {
+    fprintf(stderr, "%s %d: We support only %d Hz wave files\n", __FILE__,
+            static_cast<int32_t>(__LINE__), config.sample_rate);
+    return -1;
+  }
+
+  sherpa_ncnn::VoiceActivityDetector vad(config);
+  const int32_t num_samples = static_cast<int32_t>(samples.size());
+  const int32_t window_size = config.window_size;
+
+  std::vector<sherpa_ncnn::SpeechSegment> segments;
+
+  // Move every segment that the detector has finished so far into `segments`.
+  auto drain = [&vad, &segments]() {
+    while (!vad.Empty()) {
+      segments.push_back(vad.Front());
+      vad.Pop();
+    }
+  };
+
+  for (int32_t i = 0; i < num_samples; i += window_size) {
+    // The last chunk can be shorter than window_size. Pass only the samples
+    // that really exist so that we never read past the end of `samples`.
+    const int32_t remaining = num_samples - i;
+    const int32_t n = remaining < window_size ? remaining : window_size;
+
+    vad.AcceptWaveform(samples.data() + i, n);
+    drain();
+  }
+
+  // Emit the segment that is still open at the end of the file, if any.
+  vad.Flush();
+  drain();
+
+  std::vector<float> all_samples;
+  for (const auto &s : segments) {
+    float start = s.start / static_cast<float>(config.sample_rate);
+    float duration = s.samples.size() / static_cast<float>(config.sample_rate);
+    float stop = start + duration;  // in seconds
+
+    fprintf(stderr, "%.3f -- %.3f s\n", start, stop);
+    all_samples.insert(all_samples.end(), s.samples.begin(), s.samples.end());
+  }
+
+  std::string out_wave = "./out-without-silence.wav";
+  is_ok = sherpa_ncnn::WriteWave(out_wave, config.sample_rate,
+                                 all_samples.data(), all_samples.size());
+  if (!is_ok) {
+    fprintf(stderr, "Failed to save to %s\n", out_wave.c_str());
+    return -1;
+  }
+
+  fprintf(stderr, "Saved to %s\n", out_wave.c_str());
+
+  return 0;
+}

+ 80 - 0
sherpa-ncnn/csrc/silero-vad-model-config.cc

@@ -0,0 +1,80 @@
+/**
+ * Copyright (c)  2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "sherpa-ncnn/csrc/silero-vad-model-config.h"
+
+#include <sstream>
+#include <string>
+
+#include "platform.h"  // for NCNN_LOGE, NOLINT
+#include "sherpa-ncnn/csrc/file-utils.h"
+
+namespace sherpa_ncnn {
+
+// Check that the config can be used to create a model.
+//
+// The checks run in this order and the first failure is logged:
+//  - param is given and the file exists
+//  - bin is given and the file exists
+//  - threshold is in the range [0.01, 1)
+//
+// @return Return true if all checks pass; return false otherwise.
+bool SileroVadModelConfig::Validate() const {
+  if (param.empty()) {
+    NCNN_LOGE("Please provide filename to silero.ncnn.param");
+    return false;
+  } else if (!FileExists(param)) {
+    NCNN_LOGE("'%s' does not exist", param.c_str());
+    return false;
+  } else if (bin.empty()) {
+    NCNN_LOGE("Please provide filename to silero.ncnn.bin");
+    return false;
+  } else if (!FileExists(bin)) {
+    NCNN_LOGE("'%s' does not exist", bin.c_str());
+    return false;
+  } else if (threshold < 0.01) {
+    NCNN_LOGE("Please use a larger value for threshold. Given: %f", threshold);
+    return false;
+  } else if (threshold >= 1) {
+    NCNN_LOGE("Please use a smaller value for threshold. Given: %f", threshold);
+    return false;
+  }
+
+  return true;
+}
+
+// Return a human-readable, single-line description of this config,
+// e.g., for logging. opt is represented only by opt.num_threads.
+std::string SileroVadModelConfig::ToString() const {
+  std::ostringstream os;
+
+  // The name printed here must match the name of the struct.
+  os << "SileroVadModelConfig(";
+  os << "param=\"" << param << "\", ";
+  os << "bin=\"" << bin << "\", ";
+  os << "threshold=" << threshold << ", ";
+  os << "min_silence_duration=" << min_silence_duration << ", ";
+  os << "min_speech_duration=" << min_speech_duration << ", ";
+  os << "window_size=" << window_size << ", ";
+  os << "sample_rate=" << sample_rate << ", ";
+  os << "use_vulkan_compute=" << (use_vulkan_compute ? "True" : "False")
+     << ", ";
+  os << "num_threads=" << opt.num_threads << ")";
+
+  return os.str();
+}
+
+}  // namespace sherpa_ncnn

+ 65 - 0
sherpa-ncnn/csrc/silero-vad-model-config.h

@@ -0,0 +1,65 @@
+/**
+ * Copyright (c)  2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SHERPA_NCNN_CSRC_SILERO_VAD_MODEL_CONFIG_H_
+#define SHERPA_NCNN_CSRC_SILERO_VAD_MODEL_CONFIG_H_
+
+#include <memory>
+#include <string>
+
+#if __ANDROID_API__ >= 9
+#include "android/asset_manager.h"
+#include "android/asset_manager_jni.h"
+#endif
+
+#include "net.h"  // NOLINT
+
+namespace sherpa_ncnn {
+
+// Configuration of the silero vad model.
+struct SileroVadModelConfig {
+  std::string param;  // path to silero.ncnn.param
+  std::string bin;    // path to silero.ncnn.bin
+
+  // threshold to classify a segment as speech
+  //
+  // If the predicted probability of a segment is larger than this
+  // value, then it is classified as speech.
+  //
+  // Validate() accepts values in the range [0.01, 1).
+  float threshold = 0.5;
+
+  // Speech is considered to have ended only after silence has lasted
+  // for at least this long.
+  float min_silence_duration = 0.5;  // in seconds
+
+  // Speech is reported only after it has lasted for at least this long.
+  float min_speech_duration = 0.25;  // in seconds
+
+  // 512, 1024, 1536 samples for 16000 Hz
+  // 256, 512, 768 samples for 8000 Hz
+  int32_t window_size = 512;  // in samples
+
+  int32_t sample_rate = 16000;  // in Hz
+
+  // If true, the GPU is used when ncnn is built with Vulkan and
+  // at least one GPU is found.
+  bool use_vulkan_compute = true;
+
+  // Options passed to the ncnn network, e.g., opt.num_threads
+  ncnn::Option opt;
+
+  // Return true if the config is valid. Errors are logged.
+  bool Validate() const;
+
+  // Return a string describing this config.
+  std::string ToString() const;
+};
+
+}  // namespace sherpa_ncnn
+
+#endif  // SHERPA_NCNN_CSRC_SILERO_VAD_MODEL_CONFIG_H_

+ 284 - 0
sherpa-ncnn/csrc/silero-vad-model.cc

@@ -0,0 +1,284 @@
+/**
+ * Copyright (c)  2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sherpa-ncnn/csrc/silero-vad-model.h"
+
+#include <vector>
+
+#include "net.h"  // NOLINT
+#include "sherpa-ncnn/csrc/model.h"
+#include "sherpa-ncnn/csrc/silero-vad-model-config.h"
+
+namespace sherpa_ncnn {
+
+// Private implementation of SileroVadModel.
+//
+// It owns the ncnn network, the states h_ and c_ that are fed back to the
+// network on every call, and a small state machine (triggered_, temp_start_,
+// temp_end_) that turns the probability of each window into a
+// speech / non-speech decision.
+class SileroVadModel::Impl {
+ public:
+  // Copy config.opt to the network, enable Vulkan if a GPU is available and
+  // config.use_vulkan_compute is true, and then load the model from
+  // config.param and config.bin.
+  explicit Impl(const SileroVadModelConfig &config) : config_(config) {
+    model_.opt = config.opt;
+    bool has_gpu = false;
+
+#if NCNN_VULKAN
+    has_gpu = ncnn::get_gpu_count() > 0;
+#endif
+
+    if (has_gpu && config_.use_vulkan_compute) {
+      model_.opt.use_vulkan_compute = true;
+      NCNN_LOGE("Use GPU");
+    }
+
+    Model::InitNet(model_, config_.param, config_.bin);
+    PostInit();
+  }
+
+#if __ANDROID_API__ >= 9
+  // Same as the constructor above, but the model is loaded through
+  // the given Android asset manager.
+  Impl(AAssetManager *mgr, const SileroVadModelConfig &config)
+      : config_(config) {
+    model_.opt = config.opt;
+    bool has_gpu = false;
+
+#if NCNN_VULKAN
+    has_gpu = ncnn::get_gpu_count() > 0;
+#endif
+
+    if (has_gpu && config_.use_vulkan_compute) {
+      model_.opt.use_vulkan_compute = true;
+      NCNN_LOGE("Use GPU");
+    }
+
+    Model::InitNet(mgr, model_, config_.param, config_.bin);
+
+    PostInit();
+  }
+#endif
+
+  // Zero the model states and put the state machine back to
+  // its initial (non-speech) state.
+  void Reset() {
+    ResetV4();
+
+    triggered_ = false;
+    current_sample_ = 0;
+    temp_start_ = 0;
+    temp_end_ = 0;
+  }
+
+  // Run the model on one window and update the state machine.
+  //
+  // @param samples Pointer to the samples of the window.
+  // @param n Number of samples. It must equal WindowSize(); otherwise an
+  //          error is logged and the program exits.
+  // @return Return true if we are inside a speech segment after processing
+  //         this window; return false otherwise.
+  bool IsSpeech(const float *samples, int32_t n) {
+    if (n != WindowSize()) {
+      NCNN_LOGE("n: %d != window_size: %d", n, WindowSize());
+      exit(-1);
+    }
+
+    float prob = Run(samples, n);
+
+    float threshold = config_.threshold;
+
+    // current_sample_ is the number of samples processed so far, including
+    // this window. It is never 0 below, so 0 can be used as "not set" for
+    // temp_start_ and temp_end_.
+    current_sample_ += config_.window_size;
+
+    if (prob > threshold && temp_end_ != 0) {
+      // speech resumed before the silence became long enough
+      temp_end_ = 0;
+    }
+
+    if (prob > threshold && temp_start_ == 0) {
+      // start speaking, but we require that it must satisfy
+      // min_speech_duration
+      temp_start_ = current_sample_;
+      return false;
+    }
+
+    if (prob > threshold && temp_start_ != 0 && !triggered_) {
+      if (current_sample_ - temp_start_ < min_speech_samples_) {
+        // the candidate speech is still too short
+        return false;
+      }
+
+      triggered_ = true;
+
+      return true;
+    }
+
+    if ((prob < threshold) && !triggered_) {
+      // silence
+      temp_start_ = 0;
+      temp_end_ = 0;
+      return false;
+    }
+
+    // While speaking, a lower threshold (threshold - 0.15) is used
+    // to decide whether the speech continues.
+    if ((prob > threshold - 0.15) && triggered_) {
+      // speaking
+      return true;
+    }
+
+    // NOTE(review): this branch looks unreachable. When prob > threshold and
+    // !triggered_, one of the two `prob > threshold` branches above has
+    // already returned -- confirm before removing.
+    if ((prob > threshold) && !triggered_) {
+      // start speaking
+      triggered_ = true;
+
+      return true;
+    }
+
+    if ((prob < threshold) && triggered_) {
+      // stop speaking; remember where the silence started
+      if (temp_end_ == 0) {
+        temp_end_ = current_sample_;
+      }
+
+      if (current_sample_ - temp_end_ < min_silence_samples_) {
+        // continue speaking
+        return true;
+      }
+      // stopped speaking
+      temp_start_ = 0;
+      temp_end_ = 0;
+      triggered_ = false;
+      return false;
+    }
+
+    return false;
+  }
+
+  // For V4, the window shift equals the window size.
+  int32_t WindowShift() const { return config_.window_size; }
+
+  int32_t WindowSize() const { return config_.window_size; }
+
+  int32_t MinSilenceDurationSamples() const { return min_silence_samples_; }
+
+  int32_t MinSpeechDurationSamples() const { return min_speech_samples_; }
+
+  // @param s Minimum silence duration in seconds. It is converted to
+  //          samples using config_.sample_rate.
+  void SetMinSilenceDuration(float s) {
+    min_silence_samples_ = config_.sample_rate * s;
+  }
+
+  void SetThreshold(float threshold) { config_.threshold = threshold; }
+
+ private:
+  // Convert the durations in the config from seconds to samples, look up the
+  // blob indexes of the model inputs/outputs by name, and allocate
+  // zero-initialized states h_ and c_.
+  void PostInit() {
+    min_silence_samples_ = config_.sample_rate * config_.min_silence_duration;
+
+    min_speech_samples_ = config_.sample_rate * config_.min_speech_duration;
+
+    // input indexes map
+    // [0] -> in0, x
+    // [1] -> in1, h
+    // [2] -> in2, c
+    // NOTE(review): only entries [0], [1], and [2] are assigned below; the
+    // fourth entry appears to be unused.
+    input_indexes_.resize(4);
+
+    // output indexes map
+    // [0] -> out0, prob
+    // [1] -> out1, h
+    // [2] -> out2, c
+    output_indexes_.resize(3);
+
+    // An index stays 0 if no blob with the expected name is found.
+    const auto &blobs = model_.blobs();
+    for (int32_t i = 0; i != blobs.size(); ++i) {
+      const auto &b = blobs[i];
+      if (b.name == "in0") input_indexes_[0] = i;
+      if (b.name == "in1") input_indexes_[1] = i;
+      if (b.name == "in2") input_indexes_[2] = i;
+      if (b.name == "out0") output_indexes_[0] = i;
+      if (b.name == "out1") output_indexes_[1] = i;
+      if (b.name == "out2") output_indexes_[2] = i;
+    }
+
+    h_ = ncnn::Mat(64, 1, 2);
+    c_ = ncnn::Mat(64, 1, 2);
+
+    h_.fill(0);
+    c_.fill(0);
+  }
+
+  // Zero the states h_ and c_ of the V4 model.
+  void ResetV4() {
+    h_.fill(0);
+    c_.fill(0);
+  }
+
+  // @return Return the speech probability of the given window.
+  float Run(const float *samples, int32_t n) {
+    // TODO(fangjun): Support V5
+    return RunV4(samples, n);
+  }
+
+  // Feed the window together with the current states h_ and c_ to the
+  // network, replace h_ and c_ with the new states returned by the network,
+  // and return the first element of the output as the speech probability.
+  float RunV4(const float *samples, int32_t n) {
+    // NOTE(review): presumably this wraps the caller's memory without
+    // copying it, hence the const_cast -- confirm against ncnn::Mat.
+    ncnn::Mat x(n, 1, 1, const_cast<float *>(samples));
+
+    ncnn::Extractor ex = model_.create_extractor();
+
+    ex.input(input_indexes_[0], x);
+    ex.input(input_indexes_[1], h_);
+    ex.input(input_indexes_[2], c_);
+
+    ncnn::Mat out;
+    ex.extract(output_indexes_[0], out);
+    ex.extract(output_indexes_[1], h_);
+    ex.extract(output_indexes_[2], c_);
+
+    float prob = out[0];
+    return prob;
+  }
+
+ private:
+  ncnn::Net model_;
+  std::vector<int32_t> input_indexes_;   // blob indexes of in0, in1, in2
+  std::vector<int32_t> output_indexes_;  // blob indexes of out0, out1, out2
+
+  ncnn::Mat h_;  // model state; updated by every call of RunV4()
+  ncnn::Mat c_;  // model state; updated by every call of RunV4()
+
+  SileroVadModelConfig config_;
+
+  int32_t min_silence_samples_;  // set in PostInit()
+  int32_t min_speech_samples_;   // set in PostInit()
+
+  bool triggered_ = false;      // true while we are inside a speech segment
+  int32_t current_sample_ = 0;  // number of samples processed so far
+  int32_t temp_start_ = 0;      // where candidate speech started; 0: not set
+  int32_t temp_end_ = 0;        // where candidate silence started; 0: not set
+};
+
+// All public methods of SileroVadModel forward to SileroVadModel::Impl.
+SileroVadModel::SileroVadModel(const SileroVadModelConfig &config)
+    : impl_(std::make_unique<Impl>(config)) {}
+
+#if __ANDROID_API__ >= 9
+// Load the model through the given Android asset manager.
+SileroVadModel::SileroVadModel(AAssetManager *mgr,
+                               const SileroVadModelConfig &config)
+    : impl_(std::make_unique<Impl>(mgr, config)) {}
+#endif
+
+// Defined here, where Impl is a complete type, because impl_ is a
+// std::unique_ptr<Impl>.
+SileroVadModel::~SileroVadModel() = default;
+
+void SileroVadModel::Reset() { return impl_->Reset(); }
+
+bool SileroVadModel::IsSpeech(const float *samples, int32_t n) {
+  return impl_->IsSpeech(samples, n);
+}
+
+int32_t SileroVadModel::WindowSize() const { return impl_->WindowSize(); }
+
+int32_t SileroVadModel::WindowShift() const { return impl_->WindowShift(); }
+
+int32_t SileroVadModel::MinSilenceDurationSamples() const {
+  return impl_->MinSilenceDurationSamples();
+}
+
+int32_t SileroVadModel::MinSpeechDurationSamples() const {
+  return impl_->MinSpeechDurationSamples();
+}
+
+// @param s Minimum silence duration in seconds.
+void SileroVadModel::SetMinSilenceDuration(float s) {
+  impl_->SetMinSilenceDuration(s);
+}
+
+void SileroVadModel::SetThreshold(float threshold) {
+  impl_->SetThreshold(threshold);
+}
+
+}  // namespace sherpa_ncnn

+ 76 - 0
sherpa-ncnn/csrc/silero-vad-model.h

@@ -0,0 +1,76 @@
+/**
+ * Copyright (c)  2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SHERPA_NCNN_CSRC_SILERO_VAD_MODEL_H_
+#define SHERPA_NCNN_CSRC_SILERO_VAD_MODEL_H_
+
+#if __ANDROID_API__ >= 9
+#include "android/asset_manager.h"
+#include "android/asset_manager_jni.h"
+#endif
+
+#include <memory>
+
+#include "sherpa-ncnn/csrc/silero-vad-model-config.h"
+
+namespace sherpa_ncnn {
+
+// Wrapper of the silero vad model. It decides, window by window,
+// whether the input audio contains speech.
+class SileroVadModel {
+ public:
+  explicit SileroVadModel(const SileroVadModelConfig &config);
+
+#if __ANDROID_API__ >= 9
+  // Load the model through the given Android asset manager.
+  SileroVadModel(AAssetManager *mgr, const SileroVadModelConfig &config);
+#endif
+
+  ~SileroVadModel();
+
+  // reset the internal model states
+  void Reset();
+
+  /**
+   * @param samples Pointer to a 1-d array containing audio samples.
+   *                Each sample should be normalized to the range [-1, 1].
+   * @param n Number of samples. It must be equal to WindowSize().
+   *
+   * @return Return true if speech is detected. Return false otherwise.
+   */
+  bool IsSpeech(const float *samples, int32_t n);
+
+  // For silero vad V4, it is WindowShift().
+  // For silero vad V5, it is WindowShift()+64 for 16kHz and
+  //                          WindowShift()+32 for 8kHz
+  int32_t WindowSize() const;
+
+  // It is the window_size from the config, e.g., 512
+  int32_t WindowShift() const;
+
+  // Minimum silence duration, in samples
+  int32_t MinSilenceDurationSamples() const;
+
+  // Minimum speech duration, in samples
+  int32_t MinSpeechDurationSamples() const;
+
+  // @param s Minimum silence duration in seconds
+  void SetMinSilenceDuration(float s);
+
+  // Change the threshold used to classify a window as speech
+  void SetThreshold(float threshold);
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace sherpa_ncnn
+
+#endif  // SHERPA_NCNN_CSRC_SILERO_VAD_MODEL_H_

+ 216 - 0
sherpa-ncnn/csrc/voice-activity-detector.cc

@@ -0,0 +1,216 @@
+/**
+ * Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "sherpa-ncnn/csrc/voice-activity-detector.h"
+
+#include <algorithm>
+#include <queue>
+#include <utility>
+
+#include "sherpa-ncnn/csrc/circular-buffer.h"
+#include "sherpa-ncnn/csrc/silero-vad-model.h"
+
+namespace sherpa_ncnn {
+
+// Private implementation of VoiceActivityDetector.
+//
+// Incoming samples are cut into windows, every window is pushed into
+// buffer_ and classified by the silero vad model. When a speech segment
+// ends, its samples are copied out of buffer_ and queued in segments_.
+class VoiceActivityDetector::Impl {
+ public:
+  // @param config Config of the silero vad model.
+  // @param buffer_size_in_seconds Capacity of the internal sample buffer.
+  explicit Impl(const SileroVadModelConfig &config,
+                float buffer_size_in_seconds = 60)
+      : model_(std::make_unique<SileroVadModel>(config)),
+        config_(config),
+        buffer_(buffer_size_in_seconds * config.sample_rate) {}
+
+#if __ANDROID_API__ >= 9
+  // Same as above, but the model is loaded through the asset manager.
+  Impl(AAssetManager *mgr, const SileroVadModelConfig &config,
+       float buffer_size_in_seconds = 60)
+      : model_(std::make_unique<SileroVadModel>(mgr, config)),
+        config_(config),
+        buffer_(buffer_size_in_seconds * config.sample_rate) {}
+#endif
+
+  // Feed n samples to the detector.
+  //
+  // Samples that do not fill a complete window are kept in last_ and are
+  // processed together with the samples of the next call.
+  //
+  // @param samples Pointer to the samples, normalized to the range [-1, 1].
+  // @param n Number of samples. It need not be a multiple of the window
+  //          size, but it should not be very large.
+  void AcceptWaveform(const float *samples, int32_t n) {
+    // If too many samples are buffered, switch to a shorter minimum silence
+    // and to new_threshold_; otherwise use the values from the config.
+    if (buffer_.Size() > max_utterance_length_) {
+      model_->SetMinSilenceDuration(new_min_silence_duration_s_);
+      model_->SetThreshold(new_threshold_);
+    } else {
+      model_->SetMinSilenceDuration(config_.min_silence_duration);
+      model_->SetThreshold(config_.threshold);
+    }
+
+    int32_t window_size = model_->WindowSize();
+    int32_t window_shift = model_->WindowShift();
+
+    // note n is usually window_size and there is no need to use
+    // an extra buffer here
+    last_.insert(last_.end(), samples, samples + n);
+
+    if (static_cast<int32_t>(last_.size()) < window_size) {
+      return;
+    }
+
+    // Note: For v4, window_shift == window_size
+    int32_t k =
+        (static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1;
+    const float *p = last_.data();
+    bool is_speech = false;
+
+    for (int32_t i = 0; i < k; ++i, p += window_shift) {
+      buffer_.Push(p, window_shift);
+      // NOTE(fangjun): Please don't use a very large n.
+      bool this_window_is_speech = model_->IsSpeech(p, window_size);
+      is_speech = is_speech || this_window_is_speech;
+    }
+
+    // Keep only the samples that have not been consumed yet.
+    last_.erase(last_.begin(), last_.begin() + k * window_shift);
+
+    if (is_speech) {
+      if (start_ == -1) {
+        // beginning of speech
+        start_ = std::max(buffer_.Tail() - 2 * model_->WindowSize() -
+                              model_->MinSpeechDurationSamples(),
+                          buffer_.Head());
+      }
+    } else {
+      // non-speech
+      if (start_ != -1 && buffer_.Size()) {
+        // end of speech, save the speech segment
+        int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
+
+        // Same guard as in Flush(): never request a segment that is
+        // empty or that has a negative length.
+        if (end > start_) {
+          SaveSegment(end);
+        }
+      }
+
+      if (start_ == -1) {
+        // No speech in progress. Keep only the most recent samples, which
+        // may become the beginning of the next speech segment.
+        int32_t end = buffer_.Tail() - 2 * model_->WindowSize() -
+                      model_->MinSpeechDurationSamples();
+        int32_t num_to_pop = std::max(0, end - buffer_.Head());
+        if (num_to_pop > 0) {
+          buffer_.Pop(num_to_pop);
+        }
+      }
+
+      start_ = -1;
+    }
+  }
+
+  // Return true if there are no finished speech segments.
+  bool Empty() const { return segments_.empty(); }
+
+  // Remove the oldest speech segment. Empty() must be false.
+  void Pop() { segments_.pop(); }
+
+  // Remove all finished speech segments.
+  void Clear() { std::queue<SpeechSegment>().swap(segments_); }
+
+  // Return the oldest speech segment. Empty() must be false.
+  const SpeechSegment &Front() const { return segments_.front(); }
+
+  // Drop all segments and all buffered samples and reset the model, so that
+  // the detector can be used for a new stream.
+  void Reset() {
+    std::queue<SpeechSegment>().swap(segments_);
+
+    model_->Reset();
+    buffer_.Reset();
+
+    // Samples left over from the previous stream must not be
+    // prepended to the next stream.
+    last_.clear();
+
+    start_ = -1;
+  }
+
+  // Save the speech segment that is still in progress, if there is one.
+  void Flush() {
+    if (start_ == -1 || buffer_.Size() == 0) {
+      return;
+    }
+
+    int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
+    if (end <= start_) {
+      return;
+    }
+
+    SaveSegment(end);
+    start_ = -1;
+  }
+
+  bool IsSpeechDetected() const { return start_ != -1; }
+
+  const SileroVadModelConfig &GetConfig() const { return config_; }
+
+ private:
+  // Copy the samples [start_, end) from buffer_ into a new SpeechSegment,
+  // append it to segments_, and pop end - buffer_.Head() samples from
+  // buffer_. Requires end > start_.
+  void SaveSegment(int32_t end) {
+    SpeechSegment segment;
+
+    segment.start = start_;
+    segment.samples = buffer_.Get(start_, end - start_);
+
+    segments_.push(std::move(segment));
+
+    buffer_.Pop(end - buffer_.Head());
+  }
+
+ private:
+  std::queue<SpeechSegment> segments_;  // finished speech segments
+
+  std::unique_ptr<SileroVadModel> model_;
+  SileroVadModelConfig config_;
+  CircularBuffer buffer_;    // samples that have been sent to the model
+  std::vector<float> last_;  // samples waiting for a complete window
+
+  int max_utterance_length_ = 16000 * 20;  // in samples
+  float new_min_silence_duration_s_ = 0.1;
+  float new_threshold_ = 1.10;
+
+  // Index into buffer_ of the first sample of the current speech segment;
+  // -1 if no speech is in progress.
+  int32_t start_ = -1;
+};
+
+// All public methods of VoiceActivityDetector forward to
+// VoiceActivityDetector::Impl.
+VoiceActivityDetector::VoiceActivityDetector(
+    const SileroVadModelConfig &config, float buffer_size_in_seconds /*= 60*/)
+    : impl_(std::make_unique<Impl>(config, buffer_size_in_seconds)) {}
+
+#if __ANDROID_API__ >= 9
+// Load the model through the given Android asset manager.
+VoiceActivityDetector::VoiceActivityDetector(
+    AAssetManager *mgr, const SileroVadModelConfig &config,
+    float buffer_size_in_seconds /*= 60*/)
+    : impl_(std::make_unique<Impl>(mgr, config, buffer_size_in_seconds)) {}
+#endif
+
+// Defined here, where Impl is a complete type, because impl_ is a
+// std::unique_ptr<Impl>.
+VoiceActivityDetector::~VoiceActivityDetector() = default;
+
+void VoiceActivityDetector::AcceptWaveform(const float *samples, int32_t n) {
+  impl_->AcceptWaveform(samples, n);
+}
+
+bool VoiceActivityDetector::Empty() const { return impl_->Empty(); }
+
+void VoiceActivityDetector::Pop() { impl_->Pop(); }
+
+void VoiceActivityDetector::Clear() { impl_->Clear(); }
+
+const SpeechSegment &VoiceActivityDetector::Front() const {
+  return impl_->Front();
+}
+
+// Note: Although it is declared const, it changes the state of the
+// detector through impl_.
+void VoiceActivityDetector::Reset() const { impl_->Reset(); }
+
+// Note: Although it is declared const, it changes the state of the
+// detector through impl_.
+void VoiceActivityDetector::Flush() const { impl_->Flush(); }
+
+bool VoiceActivityDetector::IsSpeechDetected() const {
+  return impl_->IsSpeechDetected();
+}
+
+const SileroVadModelConfig &VoiceActivityDetector::GetConfig() const {
+  return impl_->GetConfig();
+}
+
+}  // namespace sherpa_ncnn

+ 73 - 0
sherpa-ncnn/csrc/voice-activity-detector.h

@@ -0,0 +1,73 @@
+/**
+ * Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SHERPA_NCNN_CSRC_VOICE_ACTIVITY_DETECTOR_H_
+#define SHERPA_NCNN_CSRC_VOICE_ACTIVITY_DETECTOR_H_
+
+#include <memory>
+#include <vector>
+
+#if __ANDROID_API__ >= 9
+#include "android/asset_manager.h"
+#include "android/asset_manager_jni.h"
+#endif
+
+#include "sherpa-ncnn/csrc/silero-vad-model-config.h"
+
+namespace sherpa_ncnn {
+
+// A segment of speech found by VoiceActivityDetector.
+struct SpeechSegment {
+  int32_t start;               // position of the first sample, in samples
+  std::vector<float> samples;  // samples of this segment
+};
+
+// Split an audio stream into speech segments using the silero vad model.
+//
+// Usage: call AcceptWaveform() repeatedly; after each call, use Empty(),
+// Front(), and Pop() to retrieve the speech segments that are ready.
+class VoiceActivityDetector {
+ public:
+  // @param config Config of the silero vad model.
+  // @param buffer_size_in_seconds Capacity of the internal sample buffer.
+  explicit VoiceActivityDetector(const SileroVadModelConfig &config,
+                                 float buffer_size_in_seconds = 60);
+
+#if __ANDROID_API__ >= 9
+  // Same as above, but the model is loaded through the asset manager.
+  VoiceActivityDetector(AAssetManager *mgr, const SileroVadModelConfig &config,
+                        float buffer_size_in_seconds = 60);
+#endif
+
+  ~VoiceActivityDetector();
+
+  // Feed n samples to the detector.
+  void AcceptWaveform(const float *samples, int32_t n);
+
+  // Return true if there are no speech segments available.
+  bool Empty() const;
+
+  // Remove the oldest speech segment. Empty() must be false.
+  void Pop();
+
+  // Remove all available speech segments.
+  void Clear();
+
+  // Return the oldest speech segment. Empty() must be false.
+  const SpeechSegment &Front() const;
+
+  // Return true if a speech segment is currently in progress.
+  bool IsSpeechDetected() const;
+
+  // Remove all speech segments and reset the model and the sample buffer.
+  // Note: It changes the state of the detector although it is const.
+  void Reset() const;
+
+  // At the end of the utterance, you can invoke this method so that
+  // the last speech segment can be detected.
+  // Note: It changes the state of the detector although it is const.
+  void Flush() const;
+
+  const SileroVadModelConfig &GetConfig() const;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace sherpa_ncnn
+
+#endif  // SHERPA_NCNN_CSRC_VOICE_ACTIVITY_DETECTOR_H_

+ 259 - 64
sherpa-ncnn/csrc/wave-reader.cc

@@ -1,5 +1,5 @@
 /**
- * Copyright      2021  Xiaomi Corporation (authors: Fangjun Kuang)
+ * Copyright (c)  2022-2024  Xiaomi Corporation (authors: Fangjun Kuang)
  *
  * See LICENSE for clarification regarding multiple authors
  *
@@ -20,10 +20,11 @@
 
 #include <cassert>
 #include <fstream>
-#include <iostream>
 #include <utility>
 #include <vector>
 
+#include "platform.h"  // NOLINT
+
 namespace sherpa_ncnn {
 namespace {
 // see http://soundfile.sapp.org/doc/WaveFormat/
@@ -31,53 +32,12 @@ namespace {
 // Note: We assume little endian here
 // TODO(fangjun): Support big endian
 struct WaveHeader {
-  bool Validate() const {
-    //                 F F I R
-    if (chunk_id != 0x46464952) {
-      return false;
-    }
-    //               E V A W
-    if (format != 0x45564157) {
-      return false;
-    }
-
-    //                       t m f
-    if (subchunk1_id != 0x20746d66) {
-      return false;
-    }
-
-    if (subchunk1_size != 16) {  // 16 for PCM
-      return false;
-    }
-
-    if (audio_format != 1) {  // 1 for PCM
-      return false;
-    }
-
-    if (num_channels != 1) {  // we support only single channel for now
-      return false;
-    }
-    if (byte_rate != (sample_rate * num_channels * bits_per_sample / 8)) {
-      return false;
-    }
-
-    if (block_align != (num_channels * bits_per_sample / 8)) {
-      return false;
-    }
-
-    if (bits_per_sample != 16) {  // we support only 16 bits per sample
-      return false;
-    }
-
-    return true;
-  }
-
   // See
   // https://en.wikipedia.org/wiki/WAV#Metadata
   // and
   // https://www.robotplanet.dk/audio/wav_meta_data/riff_mci.pdf
   void SeekToDataChunk(std::istream &is) {
-    //                        a t a d
+    //                              a t a d
     while (is && subchunk2_id != 0x61746164) {
       // const char *p = reinterpret_cast<const char *>(&subchunk2_id);
       // printf("Skip chunk (%x): %c%c%c%c of size: %d\n", subchunk2_id, p[0],
@@ -104,46 +64,255 @@ struct WaveHeader {
 };
 static_assert(sizeof(WaveHeader) == 44, "");
 
+/*
+sox int16-1-channel-zh.wav -b 8 int8-1-channel-zh.wav
+
+sox int16-1-channel-zh.wav -c 2 int16-2-channel-zh.wav
+
+we use audacity to generate int32-1-channel-zh.wav and float32-1-channel-zh.wav
+because sox uses WAVE_FORMAT_EXTENSIBLE, which is not easy to support
+in sherpa-ncnn.
+ */
+
 // Read a wave file of mono-channel.
 // Return its samples normalized to the range [-1, 1).
-std::vector<float> ReadWaveImpl(std::istream &is, float expected_sample_rate,
+std::vector<float> ReadWaveImpl(std::istream &is, int32_t *sampling_rate,
                                 bool *is_ok) {
-  WaveHeader header;
-  is.read(reinterpret_cast<char *>(&header), sizeof(header));
-  if (!is) {
+  WaveHeader header{};
+  is.read(reinterpret_cast<char *>(&header.chunk_id), sizeof(header.chunk_id));
+
+  //                        F F I R
+  if (header.chunk_id != 0x46464952) {
+    NCNN_LOGE("Expected chunk_id RIFF. Given: 0x%08x\n", header.chunk_id);
+    *is_ok = false;
+    return {};
+  }
+
+  is.read(reinterpret_cast<char *>(&header.chunk_size),
+          sizeof(header.chunk_size));
+
+  is.read(reinterpret_cast<char *>(&header.format), sizeof(header.format));
+
+  //                      E V A W
+  if (header.format != 0x45564157) {
+    NCNN_LOGE("Expected format WAVE. Given: 0x%08x\n", header.format);
     *is_ok = false;
     return {};
   }
 
-  if (!header.Validate()) {
+  is.read(reinterpret_cast<char *>(&header.subchunk1_id),
+          sizeof(header.subchunk1_id));
+
+  is.read(reinterpret_cast<char *>(&header.subchunk1_size),
+          sizeof(header.subchunk1_size));
+
+  if (header.subchunk1_id == 0x4b4e554a) {
+    // skip junk padding
+    is.seekg(header.subchunk1_size, std::istream::cur);
+
+    is.read(reinterpret_cast<char *>(&header.subchunk1_id),
+            sizeof(header.subchunk1_id));
+
+    is.read(reinterpret_cast<char *>(&header.subchunk1_size),
+            sizeof(header.subchunk1_size));
+  }
+
+  if (header.subchunk1_id != 0x20746d66) {
+    NCNN_LOGE("Expected subchunk1_id 0x20746d66. Given: 0x%08x\n",
+              header.subchunk1_id);
     *is_ok = false;
     return {};
   }
 
-  header.SeekToDataChunk(is);
-  if (!is) {
+  // NAudio uses 18
+  // See https://github.com/naudio/NAudio/issues/1132
+  if (header.subchunk1_size != 16 &&
+      header.subchunk1_size != 18) {  // 16 for PCM
+    NCNN_LOGE("Expected subchunk1_size 16. Given: %d\n", header.subchunk1_size);
     *is_ok = false;
     return {};
   }
 
-  if (expected_sample_rate != header.sample_rate) {
+  is.read(reinterpret_cast<char *>(&header.audio_format),
+          sizeof(header.audio_format));
+
+  if (header.audio_format != 1 && header.audio_format != 3) {
+    // 1 for integer PCM
+    // 3 for floating point PCM
+    // see https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
+    // and https://github.com/microsoft/DirectXTK/wiki/Wave-Formats
+    NCNN_LOGE("Expected audio_format 1. Given: %d\n", header.audio_format);
+
+    if (header.audio_format == static_cast<int16_t>(0xfffe)) {
+      NCNN_LOGE("We don't support WAVE_FORMAT_EXTENSIBLE files.");
+    }
+
     *is_ok = false;
     return {};
   }
 
-  // header.subchunk2_size contains the number of bytes in the data.
-  // As we assume each sample contains two bytes, so it is divided by 2 here
-  std::vector<int16_t> samples(header.subchunk2_size / 2);
+  is.read(reinterpret_cast<char *>(&header.num_channels),
+          sizeof(header.num_channels));
+
+  if (header.num_channels != 1) {  // we support only single channel for now
+    NCNN_LOGE(
+        "Warning: %d channels are found. We only use the first channel.\n",
+        header.num_channels);
+  }
+
+  is.read(reinterpret_cast<char *>(&header.sample_rate),
+          sizeof(header.sample_rate));
 
-  is.read(reinterpret_cast<char *>(samples.data()), header.subchunk2_size);
+  is.read(reinterpret_cast<char *>(&header.byte_rate),
+          sizeof(header.byte_rate));
+
+  is.read(reinterpret_cast<char *>(&header.block_align),
+          sizeof(header.block_align));
+
+  is.read(reinterpret_cast<char *>(&header.bits_per_sample),
+          sizeof(header.bits_per_sample));
+
+  if (header.byte_rate !=
+      (header.sample_rate * header.num_channels * header.bits_per_sample / 8)) {
+    NCNN_LOGE("Incorrect byte rate: %d. Expected: %d", header.byte_rate,
+              (header.sample_rate * header.num_channels *
+               header.bits_per_sample / 8));
+    *is_ok = false;
+    return {};
+  }
+
+  if (header.block_align !=
+      (header.num_channels * header.bits_per_sample / 8)) {
+    NCNN_LOGE("Incorrect block align: %d. Expected: %d\n", header.block_align,
+              (header.num_channels * header.bits_per_sample / 8));
+    *is_ok = false;
+    return {};
+  }
+
+  if (header.bits_per_sample != 8 && header.bits_per_sample != 16 &&
+      header.bits_per_sample != 32) {
+    NCNN_LOGE("Expected bits_per_sample 8, 16 or 32. Given: %d\n",
+              header.bits_per_sample);
+    *is_ok = false;
+    return {};
+  }
+
+  if (header.subchunk1_size == 18) {
+    // this is for NAudio. It puts extra bytes after bits_per_sample
+    // See
+    // https://github.com/naudio/NAudio/blob/master/NAudio.Core/Wave/WaveFormats/WaveFormat.cs#L223
+
+    int16_t extra_size = -1;
+    is.read(reinterpret_cast<char *>(&extra_size), sizeof(int16_t));
+    if (extra_size != 0) {
+      NCNN_LOGE(
+          "Extra size should be 0 for wave from NAudio. Current extra size "
+          "%d\n",
+          extra_size);
+      *is_ok = false;
+      return {};
+    }
+  }
+
+  is.read(reinterpret_cast<char *>(&header.subchunk2_id),
+          sizeof(header.subchunk2_id));
+
+  is.read(reinterpret_cast<char *>(&header.subchunk2_size),
+          sizeof(header.subchunk2_size));
+
+  header.SeekToDataChunk(is);
   if (!is) {
     *is_ok = false;
     return {};
   }
 
-  std::vector<float> ans(samples.size());
-  for (int32_t i = 0; i != ans.size(); ++i) {
-    ans[i] = samples[i] / 32768.;
+  *sampling_rate = header.sample_rate;
+
+  std::vector<float> ans;
+
+  if (header.bits_per_sample == 16 && header.audio_format == 1) {
+    // header.subchunk2_size contains the number of bytes in the data.
+    // As we assume each sample contains two bytes, so it is divided by 2 here
+    std::vector<int16_t> samples(header.subchunk2_size / 2);
+
+    is.read(reinterpret_cast<char *>(samples.data()), header.subchunk2_size);
+    if (!is) {
+      NCNN_LOGE("Failed to read %d bytes", header.subchunk2_size);
+      *is_ok = false;
+      return {};
+    }
+
+    ans.resize(samples.size() / header.num_channels);
+
+    // samples are interleaved
+    for (int32_t i = 0; i != static_cast<int32_t>(ans.size()); ++i) {
+      ans[i] = samples[i * header.num_channels] / 32768.;
+    }
+  } else if (header.bits_per_sample == 8 && header.audio_format == 1) {
+    // number of samples == number of bytes for 8-bit encoded samples
+    //
+    // For 8-bit encoded samples, they are unsigned!
+    std::vector<uint8_t> samples(header.subchunk2_size);
+
+    is.read(reinterpret_cast<char *>(samples.data()), header.subchunk2_size);
+    if (!is) {
+      NCNN_LOGE("Failed to read %d bytes", header.subchunk2_size);
+      *is_ok = false;
+      return {};
+    }
+
+    ans.resize(samples.size() / header.num_channels);
+    for (int32_t i = 0; i != static_cast<int32_t>(ans.size()); ++i) {
+      // Note(fangjun): We want to normalize each sample into the range [-1, 1)
+      // Since each original sample is in the range [0, 255], dividing
+      // them by 128 converts them to the range [0, 2);
+      // so after subtracting 1, we get the range [-1, 1)
+      //
+      ans[i] = samples[i * header.num_channels] / 128. - 1;
+    }
+  } else if (header.bits_per_sample == 32 && header.audio_format == 1) {
+    // 32 here is for int32
+    //
+    // header.subchunk2_size contains the number of bytes in the data.
+    // As we assume each sample contains 4 bytes, so it is divided by 4 here
+    std::vector<int32_t> samples(header.subchunk2_size / 4);
+
+    is.read(reinterpret_cast<char *>(samples.data()), header.subchunk2_size);
+    if (!is) {
+      NCNN_LOGE("Failed to read %d bytes", header.subchunk2_size);
+      *is_ok = false;
+      return {};
+    }
+
+    ans.resize(samples.size() / header.num_channels);
+    for (int32_t i = 0; i != static_cast<int32_t>(ans.size()); ++i) {
+      ans[i] = static_cast<float>(samples[i * header.num_channels]) / (1 << 31);
+    }
+  } else if (header.bits_per_sample == 32 && header.audio_format == 3) {
+    // 32 here is for float32
+    //
+    // header.subchunk2_size contains the number of bytes in the data.
+    // As we assume each sample contains 4 bytes, so it is divided by 4 here
+    std::vector<float> samples(header.subchunk2_size / 4);
+
+    is.read(reinterpret_cast<char *>(samples.data()), header.subchunk2_size);
+    if (!is) {
+      NCNN_LOGE("Failed to read %d bytes", header.subchunk2_size);
+      *is_ok = false;
+      return {};
+    }
+
+    ans.resize(samples.size() / header.num_channels);
+    for (int32_t i = 0; i != static_cast<int32_t>(ans.size()); ++i) {
+      ans[i] = samples[i * header.num_channels];
+    }
+  } else {
+    NCNN_LOGE(
+        "Unsupported %d bits per sample and audio format: %d. Supported values "
+        "are: 8, 16, 32.",
+        header.bits_per_sample, header.audio_format);
+    *is_ok = false;
+    return {};
   }
 
   *is_ok = true;
@@ -152,15 +321,41 @@ std::vector<float> ReadWaveImpl(std::istream &is, float expected_sample_rate,
 
 }  // namespace
 
-std::vector<float> ReadWave(const std::string &filename,
-                            float expected_sample_rate, bool *is_ok) {
+std::vector<float> ReadWave(const std::string &filename, int32_t *sampling_rate,
+                            bool *is_ok) {
   std::ifstream is(filename, std::ifstream::binary);
-  return ReadWave(is, expected_sample_rate, is_ok);
+  return ReadWave(is, sampling_rate, is_ok);
 }
 
-std::vector<float> ReadWave(std::istream &is, float expected_sample_rate,
+std::vector<float> ReadWave(const std::string &filename,
+                            int32_t expected_sampling_rate, bool *is_ok) {
+  // Delegate to the overload that reports the sampling rate found in the
+  // file, then compare it with the rate the caller asked for.
+  int32_t actual_rate = -1;
+  std::vector<float> wave = ReadWave(filename, &actual_rate, is_ok);
+
+  // Nothing more to do if reading failed or the rates agree.
+  if (!*is_ok || actual_rate == expected_sampling_rate) {
+    return wave;
+  }
+
+  // The file was read, but at a different rate: report failure and hand
+  // back no samples.
+  NCNN_LOGE("Expected sample rate: %d, actual sample rate: %d",
+            expected_sampling_rate, actual_rate);
+  *is_ok = false;
+  wave.clear();
+  return wave;
+}
+
+std::vector<float> ReadWave(std::istream &is, int32_t *sampling_rate,
                             bool *is_ok) {
-  auto samples = ReadWaveImpl(is, expected_sample_rate, is_ok);
+  auto samples = ReadWaveImpl(is, sampling_rate, is_ok);
+  return samples;
+}
+
+std::vector<float> ReadWave(std::istream &is, int32_t expected_sampling_rate,
+                            bool *is_ok) {
+  int32_t sampling_rate = -1;
+  auto samples = ReadWave(is, &sampling_rate, is_ok);
+  if (*is_ok && expected_sampling_rate != sampling_rate) {
+    *is_ok = false;
+    NCNN_LOGE("Expected sample rate: %d, actual sample rate: %d",
+              expected_sampling_rate, sampling_rate);
+    samples.clear();
+  }
   return samples;
 }
 

+ 12 - 7
sherpa-ncnn/csrc/wave-reader.h

@@ -1,5 +1,5 @@
 /**
- * Copyright      2021  Xiaomi Corporation (authors: Fangjun Kuang)
+ * Copyright (c)  2022-2024  Xiaomi Corporation (authors: Fangjun Kuang)
  *
  * See LICENSE for clarification regarding multiple authors
  *
@@ -15,7 +15,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #ifndef SHERPA_NCNN_CSRC_WAVE_READER_H_
 #define SHERPA_NCNN_CSRC_WAVE_READER_H_
 
@@ -27,17 +26,23 @@ namespace sherpa_ncnn {
 
 /** Read a wave file with expected sample rate.
 
-    @param filename Path to a wave file. It MUST be single channel, PCM encoded.
-    @param expected_sample_rate  Expected sample rate of the wave file. If the
-                               sample rate don't match, it throws an exception.
+    @param filename Path to a wave file. 8/16/32-bit integer PCM and 32-bit
+                    float PCM are read; only the first channel is returned.
+    @param sampling_rate  On return, it contains the sampling rate of the file.
     @param is_ok On return it is true if the reading succeeded; false otherwise.
 
     @return Return wave samples normalized to the range [-1, 1).
  */
+std::vector<float> ReadWave(const std::string &filename, int32_t *sampling_rate,
+                            bool *is_ok);
+
+std::vector<float> ReadWave(std::istream &is, int32_t *sampling_rate,
+                            bool *is_ok);
+
 std::vector<float> ReadWave(const std::string &filename,
-                            float expected_sample_rate, bool *is_ok);
+                            int32_t expected_sampling_rate, bool *is_ok);
 
-std::vector<float> ReadWave(std::istream &is, float expected_sample_rate,
+std::vector<float> ReadWave(std::istream &is, int32_t expected_sampling_rate,
                             bool *is_ok);
 
 }  // namespace sherpa_ncnn

+ 95 - 0
sherpa-ncnn/csrc/wave-writer.cc

@@ -0,0 +1,95 @@
+/**
+ * Copyright (c)  2022-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "sherpa-ncnn/csrc/wave-writer.h"
+
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "platform.h"  //NOLINT
+
+namespace sherpa_ncnn {
+namespace {
+
+// Header of a PCM wave file (the field sizes below sum to 44 bytes); see
+// http://soundfile.sapp.org/doc/WaveFormat/
+// Note: We assume little endian here
+// TODO(fangjun): Support big endian
+struct WaveHeader {
+  int32_t chunk_id;  // "RIFF", stored as 0x46464952
+  int32_t chunk_size;  // 36 + subchunk2_size
+  int32_t format;  // "WAVE", stored as 0x45564157
+  int32_t subchunk1_id;  // "fmt ", stored as 0x20746d66
+  int32_t subchunk1_size;  // 16 for PCM
+  int16_t audio_format;  // 1 for PCM
+  int16_t num_channels;
+  int32_t sample_rate;
+  int32_t byte_rate;  // sample_rate * num_channels * bits_per_sample / 8
+  int16_t block_align;  // num_channels * bits_per_sample / 8
+  int16_t bits_per_sample;
+  int32_t subchunk2_id;    // "data", stored as 0x61746164
+  int32_t subchunk2_size;  // number of bytes of sample data
+};
+
+}  // namespace
+
+// Save mono float samples as a 16-bit PCM RIFF/WAVE file.
+//
+// @param filename Path of the wave file to create.
+// @param sampling_rate Sample rate written to the header.
+// @param samples Pointer to n samples, nominally in the range [-1, 1].
+//                Values outside that range are clamped and NaN is written
+//                as 0, so the conversion to int16_t can never overflow.
+// @param n Number of samples. It must be non-negative and small enough for
+//          the 32-bit size fields of the header.
+// @return Return true on success; return false if the arguments are invalid,
+//         the file cannot be created, or writing fails.
+bool WriteWave(const std::string &filename, int32_t sampling_rate,
+               const float *samples, int32_t n) {
+  const int32_t num_channels = 1;
+  const int32_t bits_per_sample = 16;  // int16_t
+  const int32_t bytes_per_frame = num_channels * bits_per_sample / 8;
+
+  // chunk_size (36 + number of data bytes) is a signed 32-bit field, which
+  // bounds the number of samples the header can describe.
+  const int32_t max_num_samples = (0x7fffffff - 36) / bytes_per_frame;
+  if (n < 0 || n > max_num_samples || (n > 0 && samples == nullptr)) {
+    NCNN_LOGE("Invalid samples (n = %d) for %s", n, filename.c_str());
+    return false;
+  }
+
+  WaveHeader header{};
+  header.chunk_id = 0x46464952;      // FFIR
+  header.format = 0x45564157;        // EVAW
+  header.subchunk1_id = 0x20746d66;  // "fmt "
+  header.subchunk1_size = 16;        // 16 for PCM
+  header.audio_format = 1;           // PCM =1
+
+  header.num_channels = num_channels;
+  header.sample_rate = sampling_rate;
+  header.byte_rate = sampling_rate * bytes_per_frame;
+  header.block_align = bytes_per_frame;
+  header.bits_per_sample = bits_per_sample;
+  header.subchunk2_id = 0x61746164;  // atad
+  // Multiply by the frame size directly; n * bits_per_sample would overflow
+  // int32_t long before the byte count itself does.
+  header.subchunk2_size = n * bytes_per_frame;
+
+  header.chunk_size = 36 + header.subchunk2_size;
+
+  std::vector<int16_t> samples_int16(n);
+  for (int32_t i = 0; i != n; ++i) {
+    float s = samples[i];
+    if (s != s) {  // NaN compares unequal to itself
+      s = 0;
+    } else if (s > 1) {
+      s = 1;
+    } else if (s < -1) {
+      s = -1;
+    }
+
+    // 32767 is the largest int16_t, so [-1, 1] maps onto [-32767, 32767]
+    samples_int16[i] = static_cast<int16_t>(s * 32767);
+  }
+
+  std::ofstream os(filename, std::ios::binary);
+  if (!os) {
+    NCNN_LOGE("Failed to create %s", filename.c_str());
+    return false;
+  }
+
+  os.write(reinterpret_cast<const char *>(&header), sizeof(header));
+  os.write(reinterpret_cast<const char *>(samples_int16.data()),
+           samples_int16.size() * sizeof(int16_t));
+
+  if (!os) {
+    NCNN_LOGE("Write %s failed", filename.c_str());
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace sherpa_ncnn

+ 40 - 0
sherpa-ncnn/csrc/wave-writer.h

@@ -0,0 +1,40 @@
+/**
+ * Copyright (c)  2022-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SHERPA_NCNN_CSRC_WAVE_WRITER_H_
+#define SHERPA_NCNN_CSRC_WAVE_WRITER_H_
+
+#include <cstdint>
+#include <string>
+
+namespace sherpa_ncnn {
+
+// Write a single channel wave file.
+// Note that the input samples are in the range [-1, 1]. It will be multiplied
+// by 32767 and saved in int16_t format in the wave file.
+//
+// @param filename Path to save the samples.
+// @param sampling_rate Sample rate of the samples.
+// @param samples Pointer to the samples
+// @param n Number of samples
+// @return Return true if the write succeeds; return false otherwise.
+bool WriteWave(const std::string &filename, int32_t sampling_rate,
+               const float *samples, int32_t n);
+
+}  // namespace sherpa_ncnn
+
+#endif  // SHERPA_NCNN_CSRC_WAVE_WRITER_H_

+ 2 - 2
swift-api-examples/run-decode-file.sh

@@ -23,8 +23,8 @@ if [ ! -e ./decode-file ]; then
     -I ../build-swift-macos/sherpa-ncnn.xcframework/Headers/ \
     -import-objc-header ./SherpaNcnn-Bridging-Header.h \
     ./decode-file.swift  ./SherpaNcnn.swift \
-    -L ../build-swift-macos/openmp.xcframework/macos-x86_64 \
-    -L ../build-swift-macos/sherpa-ncnn.xcframework/macos-x86_64 \
+    -L ../build-swift-macos/openmp.xcframework/macos-arm64_x86_64 \
+    -L ../build-swift-macos/sherpa-ncnn.xcframework/macos-arm64_x86_64 \
     -l sherpa-ncnn \
     -l omp \
     -o decode-file