/*
Speech recognition with [Next-gen Kaldi].

[sherpa-ncnn] is an open-source speech recognition framework for [Next-gen Kaldi].
It depends only on [ncnn], supporting both streaming and non-streaming
speech recognition.

It does not need to access the network during recognition and everything
runs locally.

It supports a variety of platforms, such as Linux (x86_64, aarch64, arm),
Windows (x86_64, x86), macOS (x86_64, arm64), RISC-V, etc.

Usage examples:

 1. Real-time speech recognition from a microphone

    Please see
    https://github.com/k2-fsa/sherpa-ncnn/tree/master/go-api-examples/real-time-speech-recognition-from-microphone

 2. Decode a file

    Please see
    https://github.com/k2-fsa/sherpa-ncnn/tree/master/go-api-examples/decode-file

[sherpa-ncnn]: https://github.com/k2-fsa/sherpa-ncnn
[ncnn]: https://github.com/tencent/ncnn
[Next-gen Kaldi]: https://github.com/k2-fsa/
*/
package sherpa_ncnn

// #include <stdlib.h>
// #include "c-api.h"
import "C"

import "unsafe"

// Please refer to
// https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/
// to download pre-trained models
type ModelConfig struct {
	EncoderParam string // Path to the encoder.ncnn.param
	EncoderBin   string // Path to the encoder.ncnn.bin
	DecoderParam string // Path to the decoder.ncnn.param
	DecoderBin   string // Path to the decoder.ncnn.bin
	JoinerParam  string // Path to the joiner.ncnn.param
	JoinerBin    string // Path to the joiner.ncnn.bin
	Tokens       string // Path to tokens.txt
	NumThreads   int    // Number of threads to use for neural network computation
}

// Configuration for the feature extractor
type FeatureConfig struct {
	// Sample rate expected by the model. It is 16000 for all
	// pre-trained models provided by us
	SampleRate int

	// Feature dimension expected by the model. It is 80 for all
	// pre-trained models provided by us
	FeatureDim int
}

// Configuration for the beam search decoder
type DecoderConfig struct {
	// Decoding method. Supported values are:
	// greedy_search, modified_beam_search
	DecodingMethod string

	// Number of active paths for modified_beam_search.
	// It is ignored when decoding_method is greedy_search.
	NumActivePaths int
}

// Configuration for the online/streaming recognizer.
type RecognizerConfig struct {
	Feat    FeatureConfig
	Model   ModelConfig
	Decoder DecoderConfig

	EnableEndpoint int // 1 to enable endpoint detection.

	// Please see
	// https://k2-fsa.github.io/sherpa/ncnn/endpoint.html
	// for the meaning of Rule1MinTrailingSilence, Rule2MinTrailingSilence
	// and Rule3MinUtteranceLength.
	Rule1MinTrailingSilence float32
	Rule2MinTrailingSilence float32
	Rule3MinUtteranceLength float32
}
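
// A minimal sketch of how a RecognizerConfig might be filled in for one of
// the pre-trained models; the file names below are placeholders for whatever
// model directory you downloaded, not values mandated by this package:
//
//	config := sherpa_ncnn.RecognizerConfig{
//		Feat: sherpa_ncnn.FeatureConfig{SampleRate: 16000, FeatureDim: 80},
//		Model: sherpa_ncnn.ModelConfig{
//			EncoderParam: "encoder.ncnn.param", // placeholder path
//			EncoderBin:   "encoder.ncnn.bin",   // placeholder path
//			DecoderParam: "decoder.ncnn.param",
//			DecoderBin:   "decoder.ncnn.bin",
//			JoinerParam:  "joiner.ncnn.param",
//			JoinerBin:    "joiner.ncnn.bin",
//			Tokens:       "tokens.txt",
//			NumThreads:   4,
//		},
//		Decoder: sherpa_ncnn.DecoderConfig{
//			DecodingMethod: "greedy_search",
//			NumActivePaths: 4,
//		},
//		EnableEndpoint: 0,
//	}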

// It contains the recognition result for an online stream.
type RecognizerResult struct {
	Text string
}

// The online recognizer class. It wraps a pointer from C.
type Recognizer struct {
	impl *C.struct_SherpaNcnnRecognizer
}

// The online stream class. It wraps a pointer from C.
type Stream struct {
	impl *C.struct_SherpaNcnnStream
}

// Free the internal pointer inside the recognizer to avoid memory leaks.
func DeleteRecognizer(recognizer *Recognizer) {
	C.DestroyRecognizer(recognizer.impl)
	recognizer.impl = nil
}

// The user is responsible for invoking [DeleteRecognizer]() to free
// the returned recognizer and avoid memory leaks.
func NewRecognizer(config *RecognizerConfig) *Recognizer {
	c := C.struct_SherpaNcnnRecognizerConfig{}
	c.feat_config.sampling_rate = C.float(config.Feat.SampleRate)
	c.feat_config.feature_dim = C.int(config.Feat.FeatureDim)

	c.model_config.encoder_param = C.CString(config.Model.EncoderParam)
	defer C.free(unsafe.Pointer(c.model_config.encoder_param))

	c.model_config.encoder_bin = C.CString(config.Model.EncoderBin)
	defer C.free(unsafe.Pointer(c.model_config.encoder_bin))

	c.model_config.decoder_param = C.CString(config.Model.DecoderParam)
	defer C.free(unsafe.Pointer(c.model_config.decoder_param))

	c.model_config.decoder_bin = C.CString(config.Model.DecoderBin)
	defer C.free(unsafe.Pointer(c.model_config.decoder_bin))

	c.model_config.joiner_param = C.CString(config.Model.JoinerParam)
	defer C.free(unsafe.Pointer(c.model_config.joiner_param))

	c.model_config.joiner_bin = C.CString(config.Model.JoinerBin)
	defer C.free(unsafe.Pointer(c.model_config.joiner_bin))

	c.model_config.tokens = C.CString(config.Model.Tokens)
	defer C.free(unsafe.Pointer(c.model_config.tokens))

	c.model_config.use_vulkan_compute = C.int(0)
	c.model_config.num_threads = C.int(config.Model.NumThreads)

	c.decoder_config.decoding_method = C.CString(config.Decoder.DecodingMethod)
	defer C.free(unsafe.Pointer(c.decoder_config.decoding_method))

	c.decoder_config.num_active_paths = C.int(config.Decoder.NumActivePaths)

	c.enable_endpoint = C.int(config.EnableEndpoint)
	c.rule1_min_trailing_silence = C.float(config.Rule1MinTrailingSilence)
	c.rule2_min_trailing_silence = C.float(config.Rule2MinTrailingSilence)
	c.rule3_min_utterance_length = C.float(config.Rule3MinUtteranceLength)

	recognizer := &Recognizer{}
	recognizer.impl = C.CreateRecognizer(&c)

	return recognizer
}
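
// A minimal usage sketch, assuming config is a RecognizerConfig populated
// as in the example above; defer the cleanup so the C object is always freed:
//
//	recognizer := sherpa_ncnn.NewRecognizer(&config)
//	defer sherpa_ncnn.DeleteRecognizer(recognizer)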

// Delete the internal pointer inside the stream to avoid memory leaks.
func DeleteStream(stream *Stream) {
	C.DestroyStream(stream.impl)
	stream.impl = nil
}

// The user is responsible for invoking [DeleteStream]() to free
// the returned stream and avoid memory leaks.
func NewStream(recognizer *Recognizer) *Stream {
	stream := &Stream{}
	stream.impl = C.CreateStream(recognizer.impl)
	return stream
}
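
// A minimal usage sketch, assuming recognizer was created with NewRecognizer:
//
//	stream := sherpa_ncnn.NewStream(recognizer)
//	defer sherpa_ncnn.DeleteStream(stream)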

// Input audio samples for the stream.
//
// sampleRate is the actual sample rate of the input audio samples. If it
// is different from the sample rate expected by the feature extractor, we will
// do resampling inside.
//
// samples contains audio samples. Each sample is in the range [-1, 1]
func (s *Stream) AcceptWaveform(sampleRate int, samples []float32) {
	C.AcceptWaveform(s.impl, C.float(sampleRate), (*C.float)(&samples[0]), C.int(len(samples)))
}
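
// A minimal sketch of converting 16-bit PCM audio into the float32 samples
// in [-1, 1] that AcceptWaveform expects; pcm is a hypothetical []int16
// buffer captured at 16 kHz, not something provided by this package:
//
//	samples := make([]float32, len(pcm))
//	for i, v := range pcm {
//		samples[i] = float32(v) / 32768.0
//	}
//	stream.AcceptWaveform(16000, samples)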

// Signal that there will be no incoming audio samples.
// After calling this function, you cannot call [Stream.AcceptWaveform] any longer.
//
// The main purpose of this function is to flush the remaining audio samples
// buffered inside for feature extraction.
func (s *Stream) InputFinished() {
	C.InputFinished(s.impl)
}
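
// A minimal sketch of the "decode a file" pattern, assuming all samples have
// already been fed to the stream via AcceptWaveform:
//
//	stream.InputFinished()
//	for recognizer.IsReady(stream) {
//		recognizer.Decode(stream)
//	}
//	result := recognizer.GetResult(stream)
//	fmt.Println(result.Text)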

// Check whether the stream has enough feature frames for decoding.
// Return true if this stream is ready for decoding. Return false otherwise.
//
// You will usually use it like below:
//
//	for recognizer.IsReady(s) {
//		recognizer.Decode(s)
//	}
func (recognizer *Recognizer) IsReady(s *Stream) bool {
	return C.IsReady(recognizer.impl, s.impl) == 1
}

// Return true if an endpoint is detected.
//
// You usually use it like below:
//
//	if recognizer.IsEndpoint(s) {
//		// do your own stuff after detecting an endpoint
//
//		recognizer.Reset(s)
//	}
func (recognizer *Recognizer) IsEndpoint(s *Stream) bool {
	return C.IsEndpoint(recognizer.impl, s.impl) == 1
}

// After calling this function, the internal neural network model states
// are reset and IsEndpoint(s) would return false. GetResult(s) would also
// return an empty string.
func (recognizer *Recognizer) Reset(s *Stream) {
	C.Reset(recognizer.impl, s.impl)
}

// Decode the stream. Before calling this function, you have to ensure
// that recognizer.IsReady(s) returns true. Otherwise, you will be SAD.
//
// You usually use it like below:
//
//	for recognizer.IsReady(s) {
//		recognizer.Decode(s)
//	}
func (recognizer *Recognizer) Decode(s *Stream) {
	C.Decode(recognizer.impl, s.impl)
}

// Get the current result of the stream since the last call to Reset().
func (recognizer *Recognizer) GetResult(s *Stream) *RecognizerResult {
	p := C.GetResult(recognizer.impl, s.impl)
	defer C.DestroyResult(p)

	result := &RecognizerResult{}
	result.Text = C.GoString(p.text)

	return result
}
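
// A minimal end-to-end sketch of the streaming pattern with endpoint
// detection, assuming readChunk is a hypothetical function that returns the
// next chunk of 16 kHz float32 samples and false once the source is exhausted:
//
//	for {
//		samples, ok := readChunk() // hypothetical audio source
//		if !ok {
//			break
//		}
//		stream.AcceptWaveform(16000, samples)
//		for recognizer.IsReady(stream) {
//			recognizer.Decode(stream)
//		}
//		fmt.Println(recognizer.GetResult(stream).Text)
//		if recognizer.IsEndpoint(stream) {
//			recognizer.Reset(stream)
//		}
//	}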