/*
Package sherpa_ncnn provides speech recognition with [Next-gen Kaldi].

[sherpa-ncnn] is an open-source speech recognition framework for [Next-gen Kaldi].
It depends only on [ncnn] and supports both streaming and non-streaming
speech recognition.

It does not need to access the network during recognition; everything
runs locally.

It supports a variety of platforms, such as Linux (x86_64, aarch64, arm),
Windows (x86_64, x86), macOS (x86_64, arm64), RISC-V, etc.

Usage examples:

 1. Real-time speech recognition from a microphone

    Please see
    https://github.com/k2-fsa/sherpa-ncnn/tree/master/go-api-examples/real-time-speech-recognition-from-microphone

 2. Decode a file

    Please see
    https://github.com/k2-fsa/sherpa-ncnn/tree/master/go-api-examples/decode-file

A minimal end-to-end usage sketch is also shown right after the import
statements below.

[sherpa-ncnn]: https://github.com/k2-fsa/sherpa-ncnn
[ncnn]: https://github.com/tencent/ncnn
[Next-gen Kaldi]: https://github.com/k2-fsa/
*/
package sherpa_ncnn

// #include <stdlib.h>
// #include "c-api.h"
import "C"

import "unsafe"
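
// The sketch below shows the typical object lifecycle when calling this
// package from client code. It is only an illustration: the config and
// samples variables, as well as the surrounding imports and error handling,
// are assumed to exist in the caller and are not part of this package.
//
//	recognizer := sherpa_ncnn.NewRecognizer(&config)
//	defer sherpa_ncnn.DeleteRecognizer(recognizer)
//
//	stream := sherpa_ncnn.NewStream(recognizer)
//	defer sherpa_ncnn.DeleteStream(stream)
//
//	// Feed audio samples (float32 values in [-1, 1]) as they arrive.
//	stream.AcceptWaveform(16000, samples)
//
//	// Decode whenever enough feature frames are available.
//	for recognizer.IsReady(stream) {
//		recognizer.Decode(stream)
//	}
//
//	result := recognizer.GetResult(stream)
//	fmt.Println(result.Text)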

// ModelConfig contains the paths to the model files and the number of
// threads to use.
//
// Please refer to
// https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/
// to download pre-trained models.
type ModelConfig struct {
	EncoderParam string // Path to encoder.ncnn.param
	EncoderBin   string // Path to encoder.ncnn.bin
	DecoderParam string // Path to decoder.ncnn.param
	DecoderBin   string // Path to decoder.ncnn.bin
	JoinerParam  string // Path to joiner.ncnn.param
	JoinerBin    string // Path to joiner.ncnn.bin
	Tokens       string // Path to tokens.txt
	NumThreads   int    // Number of threads to use for neural network computation
}
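
// For instance, after downloading and unpacking one of the pre-trained models
// from the page above, the fields might be filled in as sketched below. The
// directory name is a placeholder, and the exact file names may differ from
// model to model.
//
//	modelDir := "./path/to/unpacked-model" // placeholder
//	model := sherpa_ncnn.ModelConfig{
//		EncoderParam: modelDir + "/encoder.ncnn.param",
//		EncoderBin:   modelDir + "/encoder.ncnn.bin",
//		DecoderParam: modelDir + "/decoder.ncnn.param",
//		DecoderBin:   modelDir + "/decoder.ncnn.bin",
//		JoinerParam:  modelDir + "/joiner.ncnn.param",
//		JoinerBin:    modelDir + "/joiner.ncnn.bin",
//		Tokens:       modelDir + "/tokens.txt",
//		NumThreads:   4,
//	}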

// FeatureConfig holds the configuration for the feature extractor.
type FeatureConfig struct {
	// Sample rate expected by the model. It is 16000 for all
	// pre-trained models provided by us.
	SampleRate int

	// Feature dimension expected by the model. It is 80 for all
	// pre-trained models provided by us.
	FeatureDim int
}

// DecoderConfig holds the configuration for the beam search decoder.
type DecoderConfig struct {
	// Decoding method. Supported values are:
	// greedy_search, modified_beam_search
	DecodingMethod string

	// Number of active paths for modified_beam_search.
	// It is ignored when DecodingMethod is greedy_search.
	NumActivePaths int
}

// RecognizerConfig holds the configuration for the online/streaming recognizer.
type RecognizerConfig struct {
	Feat    FeatureConfig
	Model   ModelConfig
	Decoder DecoderConfig

	EnableEndpoint int // 1 to enable endpoint detection

	// Please see
	// https://k2-fsa.github.io/sherpa/ncnn/endpoint.html
	// for the meaning of Rule1MinTrailingSilence, Rule2MinTrailingSilence,
	// and Rule3MinUtteranceLength.
	Rule1MinTrailingSilence float32
	Rule2MinTrailingSilence float32
	Rule3MinUtteranceLength float32

	HotwordsFile  string
	HotwordsScore float32
}
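
// A complete configuration might be assembled as in the sketch below. The
// endpointing values are only an illustration; see the endpoint documentation
// linked above for how to choose them. model refers to the ModelConfig sketch
// shown earlier.
//
//	config := sherpa_ncnn.RecognizerConfig{
//		Feat: sherpa_ncnn.FeatureConfig{
//			SampleRate: 16000,
//			FeatureDim: 80,
//		},
//		Model: model,
//		Decoder: sherpa_ncnn.DecoderConfig{
//			DecodingMethod: "greedy_search",
//			NumActivePaths: 4,
//		},
//		EnableEndpoint:          1,
//		Rule1MinTrailingSilence: 2.4,
//		Rule2MinTrailingSilence: 1.2,
//		Rule3MinUtteranceLength: 20,
//	}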

// RecognizerResult contains the recognition result for an online stream.
type RecognizerResult struct {
	Text string
}

// Recognizer is the online recognizer class. It wraps a pointer from C.
type Recognizer struct {
	impl *C.struct_SherpaNcnnRecognizer
}

// Stream is the online stream class. It wraps a pointer from C.
type Stream struct {
	impl *C.struct_SherpaNcnnStream
}

// DeleteRecognizer frees the internal pointer inside the recognizer
// to avoid a memory leak.
func DeleteRecognizer(recognizer *Recognizer) {
	C.DestroyRecognizer(recognizer.impl)
	recognizer.impl = nil
}

// NewRecognizer creates a new recognizer from the given config.
//
// The user is responsible for invoking [DeleteRecognizer] to free
// the returned recognizer and avoid a memory leak.
func NewRecognizer(config *RecognizerConfig) *Recognizer {
	c := C.struct_SherpaNcnnRecognizerConfig{}
	c.feat_config.sampling_rate = C.float(config.Feat.SampleRate)
	c.feat_config.feature_dim = C.int(config.Feat.FeatureDim)

	c.model_config.encoder_param = C.CString(config.Model.EncoderParam)
	defer C.free(unsafe.Pointer(c.model_config.encoder_param))

	c.model_config.encoder_bin = C.CString(config.Model.EncoderBin)
	defer C.free(unsafe.Pointer(c.model_config.encoder_bin))

	c.model_config.decoder_param = C.CString(config.Model.DecoderParam)
	defer C.free(unsafe.Pointer(c.model_config.decoder_param))

	c.model_config.decoder_bin = C.CString(config.Model.DecoderBin)
	defer C.free(unsafe.Pointer(c.model_config.decoder_bin))

	c.model_config.joiner_param = C.CString(config.Model.JoinerParam)
	defer C.free(unsafe.Pointer(c.model_config.joiner_param))

	c.model_config.joiner_bin = C.CString(config.Model.JoinerBin)
	defer C.free(unsafe.Pointer(c.model_config.joiner_bin))

	c.model_config.tokens = C.CString(config.Model.Tokens)
	defer C.free(unsafe.Pointer(c.model_config.tokens))

	c.model_config.use_vulkan_compute = C.int(0)
	c.model_config.num_threads = C.int(config.Model.NumThreads)

	c.decoder_config.decoding_method = C.CString(config.Decoder.DecodingMethod)
	defer C.free(unsafe.Pointer(c.decoder_config.decoding_method))

	c.decoder_config.num_active_paths = C.int(config.Decoder.NumActivePaths)

	c.enable_endpoint = C.int(config.EnableEndpoint)
	c.rule1_min_trailing_silence = C.float(config.Rule1MinTrailingSilence)
	c.rule2_min_trailing_silence = C.float(config.Rule2MinTrailingSilence)
	c.rule3_min_utterance_length = C.float(config.Rule3MinUtteranceLength)

	c.hotwords_file = C.CString(config.HotwordsFile)
	defer C.free(unsafe.Pointer(c.hotwords_file))

	c.hotwords_score = C.float(config.HotwordsScore)

	recognizer := &Recognizer{}
	recognizer.impl = C.CreateRecognizer(&c)

	return recognizer
}

// DeleteStream frees the internal pointer inside the stream to avoid a memory leak.
func DeleteStream(stream *Stream) {
	C.DestroyStream(stream.impl)
	stream.impl = nil
}

// NewStream creates a new stream for the given recognizer.
//
// The user is responsible for invoking [DeleteStream] to free
// the returned stream and avoid a memory leak.
func NewStream(recognizer *Recognizer) *Stream {
	stream := &Stream{}
	stream.impl = C.CreateStream(recognizer.impl)
	return stream
}

// AcceptWaveform inputs audio samples for the stream.
//
// sampleRate is the actual sample rate of the input audio samples. If it
// is different from the sample rate expected by the feature extractor, we will
// do resampling inside.
//
// samples contains the audio samples. Each sample is in the range [-1, 1].
func (s *Stream) AcceptWaveform(sampleRate int, samples []float32) {
	if len(samples) == 0 {
		// Nothing to feed; avoid taking the address of an element of an empty slice.
		return
	}

	C.AcceptWaveform(s.impl, C.float(sampleRate), (*C.float)(&samples[0]), C.int(len(samples)))
}
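
// Audio captured as 16-bit PCM can be converted to the expected float32
// format by dividing by 32768, as sketched below. The helper name and the
// source of the int16 data are illustrative, not part of this package.
//
//	func pcm16ToFloat32(pcm []int16) []float32 {
//		out := make([]float32, len(pcm))
//		for i, v := range pcm {
//			out[i] = float32(v) / 32768.0
//		}
//		return out
//	}
//
//	// Then feed the converted chunk to the stream:
//	stream.AcceptWaveform(16000, pcm16ToFloat32(chunk))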

// InputFinished signals that there will be no more incoming audio samples.
// After calling this function, you cannot call [Stream.AcceptWaveform] any longer.
//
// The main purpose of this function is to flush the remaining audio samples
// buffered inside for feature extraction.
func (s *Stream) InputFinished() {
	C.InputFinished(s.impl)
}
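
// When decoding a file, a common pattern (illustrative, not required by the
// API) is to call InputFinished once all samples have been fed and then drain
// whatever is still ready for decoding. allSamples and sampleRate are
// placeholders supplied by the caller.
//
//	stream.AcceptWaveform(sampleRate, allSamples)
//	stream.InputFinished()
//
//	for recognizer.IsReady(stream) {
//		recognizer.Decode(stream)
//	}
//
//	fmt.Println(recognizer.GetResult(stream).Text)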

// IsReady checks whether the stream has enough feature frames for decoding.
// It returns true if the stream is ready for decoding and false otherwise.
//
// You will usually use it like below:
//
//	for recognizer.IsReady(s) {
//		recognizer.Decode(s)
//	}
func (recognizer *Recognizer) IsReady(s *Stream) bool {
	return C.IsReady(recognizer.impl, s.impl) == 1
}

// IsEndpoint returns true if an endpoint is detected.
//
// You usually use it like below:
//
//	if recognizer.IsEndpoint(s) {
//		// do your own stuff after detecting an endpoint
//
//		recognizer.Reset(s)
//	}
func (recognizer *Recognizer) IsEndpoint(s *Stream) bool {
	return C.IsEndpoint(recognizer.impl, s.impl) == 1
}

// Reset resets the stream. After calling this function, the internal neural
// network model states are reset, IsEndpoint(s) would return false, and
// GetResult(s) would return an empty result.
func (recognizer *Recognizer) Reset(s *Stream) {
	C.Reset(recognizer.impl, s.impl)
}

// Decode decodes the stream. Before calling this function, you have to ensure
// that recognizer.IsReady(s) returns true. Otherwise, you will be SAD.
//
// You usually use it like below:
//
//	for recognizer.IsReady(s) {
//		recognizer.Decode(s)
//	}
func (recognizer *Recognizer) Decode(s *Stream) {
	C.Decode(recognizer.impl, s.impl)
}

// GetResult returns the current result of the stream accumulated since the
// last invocation of Reset().
func (recognizer *Recognizer) GetResult(s *Stream) *RecognizerResult {
	p := C.GetResult(recognizer.impl, s.impl)
	defer C.DestroyResult(p)

	result := &RecognizerResult{}
	result.Text = C.GoString(p.text)

	return result
}
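
// For real-time use with endpoint detection enabled, the calls above are
// typically combined into a loop like the sketch below. readChunk is a
// placeholder for whatever delivers float32 samples from the microphone; it
// is not provided by this package.
//
//	for {
//		samples, ok := readChunk() // hypothetical audio source
//		if !ok {
//			break
//		}
//		stream.AcceptWaveform(16000, samples)
//
//		for recognizer.IsReady(stream) {
//			recognizer.Decode(stream)
//		}
//
//		if recognizer.IsEndpoint(stream) {
//			fmt.Println(recognizer.GetResult(stream).Text)
//			recognizer.Reset(stream)
//		}
//	}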