SherpaNcnn.swift 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. /// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
  2. ///
  3. /// See LICENSE for clarification regarding multiple authors
  4. ///
  5. /// Licensed under the Apache License, Version 2.0 (the "License");
  6. /// you may not use this file except in compliance with the License.
  7. /// You may obtain a copy of the License at
  8. ///
  9. /// http://www.apache.org/licenses/LICENSE-2.0
  10. ///
  11. /// Unless required by applicable law or agreed to in writing, software
  12. /// distributed under the License is distributed on an "AS IS" BASIS,
  13. /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. /// See the License for the specific language governing permissions and
  15. /// limitations under the License.
  16. import Foundation // For NSString
  /// Convert a Swift `String` into a NUL-terminated `const char*` that can be
  /// passed to the C language.
  ///
  /// The returned pointer refers to a private heap copy of the UTF-8 bytes made
  /// with `strdup`, so it remains valid after this function returns and can be
  /// stored safely inside the C config structs built in this file.
  /// (`NSString.utf8String` is not suitable for that: its buffer may be
  /// released as soon as the temporary bridged string goes away.)
  ///
  /// The copy is never released by this helper. If the memory must be
  /// reclaimed, pass the pointer to `free` once the C side no longer needs it.
  ///
  /// - Parameters:
  ///   - s: The String to convert.
  /// - Returns: A pointer that can be passed to C as `const char*`, or `nil`
  ///   if the copy could not be allocated.
  func toCPointer(_ s: String) -> UnsafePointer<Int8>! {
    // Swift hands `strdup` a temporary C string for `s`; `strdup` duplicates it
    // into storage that outlives this call.
    let copy: UnsafeMutablePointer<CChar>? = strdup(s)
    return UnsafePointer<Int8>(copy)
  }
  /// Build a `SherpaNcnnModelConfig` from Swift values.
  ///
  /// Please refer to
  /// https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
  /// to download the required `.ncnn.param` and `.ncnn.bin` files.
  ///
  /// - Parameters:
  ///   - encoderParam: Path to encoder.ncnn.param
  ///   - encoderBin: Path to encoder.ncnn.bin
  ///   - decoderParam: Path to decoder.ncnn.param
  ///   - decoderBin: Path to decoder.ncnn.bin
  ///   - joinerParam: Path to joiner.ncnn.param
  ///   - joinerBin: Path to joiner.ncnn.bin
  ///   - tokens: Path to tokens.txt
  ///   - numThreads: Number of threads to use for neural network computation.
  ///     It is converted to `Int32`. Defaults to 4.
  ///   - useVulkanCompute: If it is true, and if sherpa-ncnn is compiled with
  ///     vulkan support, and if there are GPUs available, then it will use GPU
  ///     for neural network computation. Otherwise, it uses CPU for
  ///     computation. Defaults to true.
  ///
  /// - Returns: An instance of `SherpaNcnnModelConfig` whose string fields are
  ///   the C pointers produced by `toCPointer`.
  func sherpaNcnnModelConfig(
    encoderParam: String,
    encoderBin: String,
    decoderParam: String,
    decoderBin: String,
    joinerParam: String,
    joinerBin: String,
    tokens: String,
    numThreads: Int = 4,
    useVulkanCompute: Bool = true
  ) -> SherpaNcnnModelConfig {
    // Convert every path to a C string first, in the same order as the fields.
    let encoderParamPtr = toCPointer(encoderParam)
    let encoderBinPtr = toCPointer(encoderBin)
    let decoderParamPtr = toCPointer(decoderParam)
    let decoderBinPtr = toCPointer(decoderBin)
    let joinerParamPtr = toCPointer(joinerParam)
    let joinerBinPtr = toCPointer(joinerBin)
    let tokensPtr = toCPointer(tokens)
    let threadCount = Int32(numThreads)

    return SherpaNcnnModelConfig(
      encoder_param: encoderParamPtr,
      encoder_bin: encoderBinPtr,
      decoder_param: decoderParamPtr,
      decoder_bin: decoderBinPtr,
      joiner_param: joinerParamPtr,
      joiner_bin: joinerBinPtr,
      tokens: tokensPtr,
      // The C struct stores the flag as an integer: 1 = enabled, 0 = disabled.
      use_vulkan_compute: useVulkanCompute ? 1 : 0,
      num_threads: threadCount)
  }
  /// Build a `SherpaNcnnFeatureExtractorConfig` from Swift values.
  ///
  /// - Parameters:
  ///   - sampleRate: Stored unchanged in the `sampling_rate` field.
  ///   - featureDim: Stored in the `feature_dim` field after conversion to
  ///     `Int32`.
  ///   - maxFeatureVectors: Stored in the `max_feature_vectors` field after
  ///     conversion to `Int32`.
  ///
  /// - Returns: An instance of `SherpaNcnnFeatureExtractorConfig`.
  func sherpaNcnnFeatureExtractorConfig(
    sampleRate: Float,
    featureDim: Int,
    maxFeatureVectors: Int
  ) -> SherpaNcnnFeatureExtractorConfig {
    // The C struct uses 32-bit integers for both counts.
    let dim = Int32(featureDim)
    let maxVectors = Int32(maxFeatureVectors)

    return SherpaNcnnFeatureExtractorConfig(
      sampling_rate: sampleRate,
      feature_dim: dim,
      max_feature_vectors: maxVectors)
  }
  /// Create an instance of SherpaNcnnDecoderConfig
  ///
  /// - Parameters:
  ///   - decodingMethod: Valid decoding methods are "greedy_search"
  ///     and "modified_beam_search". Defaults to "greedy_search".
  ///   - numActivePaths: Used only when decodingMethod is
  ///     "modified_beam_search". It specifies the beam size for beam search.
  ///     It is converted to `Int32`. Defaults to 4.
  ///
  /// - Returns: An instance of `SherpaNcnnDecoderConfig`.
  func sherpaNcnnDecoderConfig(
    decodingMethod: String = "greedy_search",
    numActivePaths: Int = 4
  ) -> SherpaNcnnDecoderConfig {
    // The method name crosses into C as a `const char*`.
    let methodPtr = toCPointer(decodingMethod)
    let activePaths = Int32(numActivePaths)

    return SherpaNcnnDecoderConfig(
      decoding_method: methodPtr,
      num_active_paths: activePaths)
  }
  /// Create an instance of SherpaNcnnRecognizerConfig
  ///
  /// - Parameters:
  ///   - featConfig: Feature extractor configuration, stored as `feat_config`.
  ///   - modelConfig: Model configuration, stored as `model_config`.
  ///   - decoderConfig: Decoder configuration, stored as `decoder_config`.
  ///   - enableEndpoint: true to enable endpoint detection. False to disable
  ///     endpoint detection. Defaults to false.
  ///   - rule1MinTrailingSilence: An endpoint is detected if trailing silence
  ///     in seconds is larger than this value even if nothing has been
  ///     decoded. Used only when enableEndpoint is true. Defaults to 2.4.
  ///   - rule2MinTrailingSilence: An endpoint is detected if trailing silence
  ///     in seconds is larger than this value even after something that is
  ///     not blank has been decoded. Used only when enableEndpoint is true.
  ///     Defaults to 1.2.
  ///   - rule3MinUtteranceLength: An endpoint is detected if the utterance in
  ///     seconds is larger than this value. Used only when enableEndpoint is
  ///     true. Defaults to 30.
  ///
  /// - Returns: An instance of `SherpaNcnnRecognizerConfig`.
  func sherpaNcnnRecognizerConfig(
    featConfig: SherpaNcnnFeatureExtractorConfig,
    modelConfig: SherpaNcnnModelConfig,
    decoderConfig: SherpaNcnnDecoderConfig,
    enableEndpoint: Bool = false,
    rule1MinTrailingSilence: Float = 2.4,
    rule2MinTrailingSilence: Float = 1.2,
    rule3MinUtteranceLength: Float = 30
  ) -> SherpaNcnnRecognizerConfig {
    // Start from the zero-initialised C struct and fill in every field.
    var config = SherpaNcnnRecognizerConfig()
    config.feat_config = featConfig
    config.model_config = modelConfig
    config.decoder_config = decoderConfig
    // The C struct stores the flag as an integer: 1 = enabled, 0 = disabled.
    config.enable_endpoint = enableEndpoint ? 1 : 0
    config.rule1_min_trailing_silence = rule1MinTrailingSilence
    config.rule2_min_trailing_silence = rule2MinTrailingSilence
    config.rule3_min_utterance_length = rule3MinUtteranceLength
    return config
  }
  /// Wrapper for recognition result.
  ///
  /// The wrapper stores the C result pointer it is given and passes it to
  /// `DestroyResult` when the wrapper is deallocated.
  ///
  /// Usage:
  ///
  ///     let result = recognizer.getResult()
  ///     print("text: \(result.text)")
  ///
  /// Note: the spelling of the type name ("Recongition") is kept as is so that
  /// existing callers continue to compile.
  class SherpaNcnnRecongitionResult {
    /// A pointer to the underlying counterpart in C. May be `nil`.
    let result: UnsafePointer<SherpaNcnnResult>!

    /// Return the actual recognition result.
    /// For English models, it contains words separated by spaces.
    /// For Chinese models, it contains Chinese words.
    ///
    /// Returns an empty string when there is no underlying C result or when
    /// its `text` field is a null pointer, instead of trapping.
    var text: String {
      guard let result else {
        return ""
      }
      // Go through an explicit Optional so a null C string is handled too.
      let cText: UnsafePointer<CChar>? = result.pointee.text
      guard let cText else {
        return ""
      }
      return String(cString: cText)
    }

    /// Wrap a C result pointer.
    ///
    /// - Parameter result: The C result to wrap; it is destroyed in `deinit`.
    init(result: UnsafePointer<SherpaNcnnResult>!) {
      self.result = result
    }

    deinit {
      // Only destroy a result that actually exists.
      if let result {
        DestroyResult(result)
      }
    }
  }
  /// Swift wrapper around the C recognizer and its single stream.
  ///
  /// Both C handles are created in `init` and destroyed in `deinit`.
  class SherpaNcnnRecognizer {
    /// A pointer to the underlying recognizer in C.
    let recognizer: OpaquePointer!

    /// A pointer to the underlying stream in C, created from `recognizer`.
    let stream: OpaquePointer!

    /// Constructor taking a pointer to a recognizer config.
    ///
    /// - Parameter config: Passed unchanged to `CreateRecognizer`.
    init(
      config: UnsafePointer<SherpaNcnnRecognizerConfig>!
    ) {
      self.recognizer = CreateRecognizer(config)
      self.stream = CreateStream(self.recognizer)
    }

    deinit {
      // Destroy the stream first, then the recognizer it was created from.
      if let openStream = stream {
        DestroyStream(openStream)
      }

      if let handle = recognizer {
        DestroyRecognizer(handle)
      }
    }

    /// Decode wave samples.
    ///
    /// - Parameters:
    ///   - samples: Audio samples normalized to the range [-1, 1]
    ///   - sampleRate: Sample rate of the input audio samples. If it is
    ///     different from featConfig.sampleRate, we will do resample.
    ///     Caution: You cannot use a different sampleRate across different
    ///     calls to AcceptWaveform().
    func acceptWaveform(samples: [Float], sampleRate: Float = 16000) {
      let sampleCount = Int32(samples.count)
      AcceptWaveform(stream, sampleRate, samples, sampleCount)
    }

    /// Return true if `IsReady` reports 1 for this recognizer and stream.
    func isReady() -> Bool {
      return IsReady(recognizer, stream) == 1
    }

    /// If there are enough number of feature frames, it invokes the neural
    /// network computation and decoding. Otherwise, it is a no-op.
    func decode() {
      Decode(recognizer, stream)
    }

    /// Get the decoding results so far
    ///
    /// - Returns: A wrapper around the pointer returned by `GetResult`.
    func getResult() -> SherpaNcnnRecongitionResult {
      let rawResult: UnsafeMutablePointer<SherpaNcnnResult>? = GetResult(recognizer, stream)
      return SherpaNcnnRecongitionResult(result: rawResult)
    }

    /// Reset the recognizer, which clears the neural network model state
    /// and the state for decoding.
    func reset() {
      Reset(recognizer, stream)
    }

    /// Signal that no more audio samples would be available.
    /// After this call, you cannot call acceptWaveform() any more.
    func inputFinished() {
      InputFinished(stream)
    }

    /// Return true if an endpoint has been detected.
    func isEndpoint() -> Bool {
      return IsEndpoint(recognizer, stream) == 1
    }
  }