//
// SherpaNcnnViewModel.swift
// SherpaNcnn
//
// Created by knight on 2023/4/5.
//

import Foundation
import AVFoundation

enum Status {
    case stop
    case recording
}

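/// Observable view model for the SherpaNcnn demo: it captures microphone audio
/// with AVAudioEngine, streams it to a sherpa-ncnn recognizer, and publishes
/// the recognized text as numbered subtitles.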
class SherpaNcnnViewModel: ObservableObject {
    @Published var status: Status = .stop
    @Published var subtitles: String = ""

    var sentences: [String] = []
    var audioEngine: AVAudioEngine? = nil
    var recognizer: SherpaNcnnRecognizer! = nil
    var lastSentence: String = ""
    let maxSentence: Int = 20

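    /// The transcript shown in the UI: the most recent `maxSentence` finished
    /// sentences plus the sentence currently being recognized, one per line,
    /// each prefixed with its index and lowercased.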
    var results: String {
        if sentences.isEmpty && lastSentence.isEmpty {
            return ""
        }

        if sentences.isEmpty {
            return "0: \(lastSentence.lowercased())"
        }

        let start = max(sentences.count - maxSentence, 0)
        if lastSentence.isEmpty {
            return sentences.enumerated().map { (index, s) in "\(index): \(s.lowercased())" }[start...]
                .joined(separator: "\n")
        } else {
            return sentences.enumerated().map { (index, s) in "\(index): \(s.lowercased())" }[start...]
                .joined(separator: "\n") + "\n\(sentences.count): \(lastSentence.lowercased())"
        }
    }

    func updateLabel() {
        DispatchQueue.main.async {
            self.subtitles = self.results
        }
    }

    init() {
        initRecognizer()
        initRecorder()
    }

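    /// Builds the streaming recognizer from the selected pre-trained model,
    /// using 16 kHz input, 80-dimensional features, modified_beam_search
    /// decoding, and endpoint detection so finished sentences can be split off.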
    private func initRecognizer() {
        // Please select the model that best suits your needs.
        //
        // You can also modify Model.swift to add new pre-trained models from
        // https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
        let featConfig = sherpaNcnnFeatureExtractorConfig(
            sampleRate: 16000,
            featureDim: 80)

        let modelConfig = getMultilingualModelConfig2022_12_06()
        // let modelConfig = getMultilingualModelConfig2022_12_06_Int8()
        // let modelConfig = getConvEmformerSmallEnglishModelConfig2023_01_09()
        // let modelConfig = getConvEmformerSmallEnglishModelConfig2023_01_09_Int8()
        // let modelConfig = getLstmTransducerEnglish_2022_09_05()

        let decoderConfig = sherpaNcnnDecoderConfig(
            decodingMethod: "modified_beam_search",
            numActivePaths: 4)

        var config = sherpaNcnnRecognizerConfig(
            featConfig: featConfig,
            modelConfig: modelConfig,
            decoderConfig: decoderConfig,
            enableEndpoint: true,
            rule1MinTrailingSilence: 1.2,
            rule2MinTrailingSilence: 2.4,
            rule3MinUtteranceLength: 200)

        recognizer = SherpaNcnnRecognizer(config: &config)
    }

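    /// Taps the microphone input, converts each captured buffer to 16 kHz mono
    /// float samples, and streams the samples into the recognizer.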
    private func initRecorder() {
        print("init recorder")

        audioEngine = AVAudioEngine()
        let inputNode = self.audioEngine?.inputNode
        let bus = 0
        let inputFormat = inputNode?.outputFormat(forBus: bus)

        // The recognizer expects 16 kHz mono float samples, so every captured
        // buffer is run through an AVAudioConverter before being fed to it.
        let outputFormat = AVAudioFormat(
            commonFormat: .pcmFormatFloat32,
            sampleRate: 16000, channels: 1,
            interleaved: false)!

        let converter = AVAudioConverter(from: inputFormat!, to: outputFormat)!

        inputNode!.installTap(
            onBus: bus,
            bufferSize: 1024,
            format: inputFormat
        ) {
            (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
            var newBufferAvailable = true

            // Hand the tapped buffer to the converter exactly once per callback.
            let inputCallback: AVAudioConverterInputBlock = {
                inNumPackets, outStatus in
                if newBufferAvailable {
                    outStatus.pointee = .haveData
                    newBufferAvailable = false
                    return buffer
                } else {
                    outStatus.pointee = .noDataNow
                    return nil
                }
            }

            let convertedBuffer = AVAudioPCMBuffer(
                pcmFormat: outputFormat,
                frameCapacity:
                    AVAudioFrameCount(outputFormat.sampleRate)
                    * buffer.frameLength
                    / AVAudioFrameCount(buffer.format.sampleRate))!

            var error: NSError?
            let _ = converter.convert(
                to: convertedBuffer,
                error: &error, withInputFrom: inputCallback)
            // TODO(fangjun): Handle status != haveData

            let array = convertedBuffer.array()
            if !array.isEmpty {
                // Feed the resampled audio to the recognizer and decode
                // whatever is ready.
                self.recognizer.acceptWaveform(samples: array)
                while self.recognizer.isReady() {
                    self.recognizer.decode()
                }

                let isEndpoint = self.recognizer.isEndpoint()
                let text = self.recognizer.getResult().text

                if !text.isEmpty && self.lastSentence != text {
                    self.lastSentence = text
                    self.updateLabel()
                    print(text)
                }

                // An endpoint marks the end of a sentence: archive it and
                // reset the recognizer for the next one.
                if isEndpoint {
                    if !text.isEmpty {
                        let tmp = self.lastSentence
                        self.lastSentence = ""
                        self.sentences.append(tmp)
                    }
                    self.recognizer.reset()
                }
            }
        }
    }

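    /// Starts or stops recording and updates the published `status`.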
    public func toggleRecorder() {
        if status == .stop {
            startRecorder()
            status = .recording
        } else {
            stopRecorder()
            status = .stop
        }
    }

    private func startRecorder() {
        lastSentence = ""
        sentences = []

        do {
            try self.audioEngine?.start()
        } catch let error as NSError {
            print("Got an error starting audioEngine: \(error.domain), \(error)")
        }
        print("started")
    }

    private func stopRecorder() {
        audioEngine?.stop()
        print("stopped")
    }
}
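
// A minimal SwiftUI usage sketch (an assumption about the host app; the view
// name below is hypothetical and not part of this file):
//
//     import SwiftUI
//
//     struct TranscriptView: View {
//         @StateObject private var model = SherpaNcnnViewModel()
//
//         var body: some View {
//             VStack {
//                 ScrollView {
//                     Text(model.subtitles)
//                         .frame(maxWidth: .infinity, alignment: .leading)
//                 }
//                 Button(model.status == .recording ? "Stop" : "Start") {
//                     model.toggleRecorder()
//                 }
//             }
//             .padding()
//         }
//     }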