RealtimeSpeechRecognitionDlg.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. // RealtimeSpeechRecognitionDlg.cpp : implementation file
  2. //
  3. // clang-format off
  4. #include "pch.h"
  5. #include "framework.h"
  6. #include "afxdialogex.h"
  7. // clang-format on
  8. #include "RealtimeSpeechRecognitionDlg.h"
  9. #include <fstream>
  10. #include <sstream>
  11. #include <string>
  12. #include <vector>
  13. #include "RealtimeSpeechRecognition.h"
  14. #ifdef _DEBUG
  15. #define new DEBUG_NEW
  16. #endif
  17. Microphone::Microphone() {
  18. PaError err = Pa_Initialize();
  19. if (err != paNoError) {
  20. fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  21. exit(-2);
  22. }
  23. }
  24. Microphone::~Microphone() {
  25. PaError err = Pa_Terminate();
  26. if (err != paNoError) {
  27. fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  28. exit(-2);
  29. }
  30. }
  31. // CRealtimeSpeechRecognitionDlg dialog
  32. CRealtimeSpeechRecognitionDlg::CRealtimeSpeechRecognitionDlg(
  33. CWnd *pParent /*=nullptr*/)
  34. : CDialogEx(IDD_REALTIMESPEECHRECOGNITION_DIALOG, pParent) {
  35. m_hIcon = AfxGetApp()->LoadIcon(IDR_MAINFRAME);
  36. }
  37. CRealtimeSpeechRecognitionDlg::~CRealtimeSpeechRecognitionDlg() {
  38. if (recognizer_) {
  39. DestroyRecognizer(recognizer_);
  40. recognizer_ = nullptr;
  41. }
  42. }
  43. void CRealtimeSpeechRecognitionDlg::DoDataExchange(CDataExchange *pDX) {
  44. CDialogEx::DoDataExchange(pDX);
  45. DDX_Control(pDX, IDOK, my_btn_);
  46. DDX_Control(pDX, IDC_EDIT1, my_text_);
  47. }
  // Message map of the dialog (base class: CDialogEx). It registers the paint
  // and query-drag-icon window messages and routes clicks on the IDOK button
  // to OnBnClickedOk().
  48. BEGIN_MESSAGE_MAP(CRealtimeSpeechRecognitionDlg, CDialogEx)
  49. ON_WM_PAINT()
  50. ON_WM_QUERYDRAGICON()
  51. ON_BN_CLICKED(IDOK, &CRealtimeSpeechRecognitionDlg::OnBnClickedOk)
  52. END_MESSAGE_MAP()
  53. // CRealtimeSpeechRecognitionDlg message handlers
  54. BOOL CRealtimeSpeechRecognitionDlg::OnInitDialog() {
  55. CDialogEx::OnInitDialog();
  56. // Set the icon for this dialog. The framework does this automatically
  57. // when the application's main window is not a dialog
  58. SetIcon(m_hIcon, TRUE); // Set big icon
  59. SetIcon(m_hIcon, FALSE); // Set small icon
  60. // TODO: Add extra initialization here
  61. InitMicrophone();
  62. return TRUE; // return TRUE unless you set the focus to a control
  63. }
  64. // If you add a minimize button to your dialog, you will need the code below
  65. // to draw the icon. For MFC applications using the document/view model,
  66. // this is automatically done for you by the framework.
  67. void CRealtimeSpeechRecognitionDlg::OnPaint() {
  68. if (IsIconic()) {
  69. CPaintDC dc(this); // device context for painting
  70. SendMessage(WM_ICONERASEBKGND, reinterpret_cast<WPARAM>(dc.GetSafeHdc()),
  71. 0);
  72. // Center icon in client rectangle
  73. int cxIcon = GetSystemMetrics(SM_CXICON);
  74. int cyIcon = GetSystemMetrics(SM_CYICON);
  75. CRect rect;
  76. GetClientRect(&rect);
  77. int x = (rect.Width() - cxIcon + 1) / 2;
  78. int y = (rect.Height() - cyIcon + 1) / 2;
  79. // Draw the icon
  80. dc.DrawIcon(x, y, m_hIcon);
  81. } else {
  82. CDialogEx::OnPaint();
  83. }
  84. }
  85. // The system calls this function to obtain the cursor to display while the user
  86. // drags
  87. // the minimized window.
  88. HCURSOR CRealtimeSpeechRecognitionDlg::OnQueryDragIcon() {
  89. return static_cast<HCURSOR>(m_hIcon);
  90. }
  // Converts a UTF-8 encoded string to UTF-16.
  //
  // Based on
  // https://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring
  //
  // Code points above U+FFFF are written as a surrogate pair, so the result
  // always consists of 16-bit code units stored in wchar_t elements.
  //
  // utf8:    the bytes to decode.
  // Returns: the decoded text; empty input gives an empty string.
  // Throws:  std::logic_error("not a UTF-8 string") if the input is not
  //          well-formed: stray continuation byte, invalid lead byte,
  //          truncated sequence, overlong encoding, encoded surrogate
  //          (U+D800..U+DFFF) or a value above U+10FFFF.
  static std::wstring Utf8ToUtf16(const std::string &utf8) {
    static const char kNotUtf8[] = "not a UTF-8 string";

    std::wstring utf16;
    // Every UTF-8 byte yields at most one UTF-16 code unit.
    utf16.reserve(utf8.size());

    const size_t size = utf8.size();
    size_t pos = 0;
    while (pos < size) {
      const unsigned char lead = static_cast<unsigned char>(utf8[pos++]);

      unsigned long code_point = 0;  // NOLINT
      size_t trailing = 0;           // number of continuation bytes expected
      unsigned long smallest = 0;    // NOLINT, smallest value of that length

      if (lead <= 0x7F) {
        code_point = lead;
      } else if (lead <= 0xBF) {
        // A continuation byte cannot start a sequence.
        throw std::logic_error(kNotUtf8);
      } else if (lead <= 0xDF) {
        code_point = lead & 0x1F;
        trailing = 1;
        smallest = 0x80;
      } else if (lead <= 0xEF) {
        code_point = lead & 0x0F;
        trailing = 2;
        smallest = 0x800;
      } else if (lead <= 0xF7) {
        code_point = lead & 0x07;
        trailing = 3;
        smallest = 0x10000;
      } else {
        throw std::logic_error(kNotUtf8);
      }

      for (size_t k = 0; k < trailing; ++k) {
        if (pos == size) throw std::logic_error(kNotUtf8);

        const unsigned char next = static_cast<unsigned char>(utf8[pos++]);
        if (next < 0x80 || next > 0xBF) throw std::logic_error(kNotUtf8);

        code_point = (code_point << 6) + (next & 0x3F);
      }

      // Overlong form: the value would have fit into a shorter sequence.
      if (code_point < smallest) throw std::logic_error(kNotUtf8);

      if (code_point >= 0xD800 && code_point <= 0xDFFF) {
        throw std::logic_error(kNotUtf8);
      }

      if (code_point > 0x10FFFF) throw std::logic_error(kNotUtf8);

      if (code_point <= 0xFFFF) {
        utf16 += static_cast<wchar_t>(code_point);
      } else {
        // Split into a high and a low surrogate.
        code_point -= 0x10000;
        utf16 += static_cast<wchar_t>((code_point >> 10) + 0xD800);
        utf16 += static_cast<wchar_t>((code_point & 0x3FF) + 0xDC00);
      }
    }

    return utf16;
  }
  143. void CRealtimeSpeechRecognitionDlg::AppendTextToEditCtrl(const std::string &s) {
  144. // get the initial text length
  145. int nLength = my_text_.GetWindowTextLength();
  146. // put the selection at the end of text
  147. my_text_.SetSel(nLength, nLength);
  148. // replace the selection
  149. std::wstring wstr = Utf8ToUtf16(s);
  150. // my_text_.ReplaceSel(wstr.c_str());
  151. my_text_.ReplaceSel(wstr.c_str());
  152. }
  153. void CRealtimeSpeechRecognitionDlg::AppendLineToMultilineEditCtrl(
  154. const std::string &s) {
  155. AppendTextToEditCtrl("\r\n" + s);
  156. }
  157. void CRealtimeSpeechRecognitionDlg::InitMicrophone() {
  158. int default_device = Pa_GetDefaultInputDevice();
  159. int device_count = Pa_GetDeviceCount();
  160. if (default_device == paNoDevice) {
  161. // CString str;
  162. // str.Format(_T("No default input device found!"));
  163. // AfxMessageBox(str, MB_OK | MB_ICONSTOP);
  164. // exit(-1);
  165. AppendLineToMultilineEditCtrl("No default input device found!");
  166. my_btn_.EnableWindow(FALSE);
  167. return;
  168. }
  169. AppendLineToMultilineEditCtrl(std::string("Selected device ") +
  170. Pa_GetDeviceInfo(default_device)->name);
  171. }
  172. static int32_t RecordCallback(const void *input_buffer,
  173. void * /*output_buffer*/,
  174. unsigned long frames_per_buffer, // NOLINT
  175. const PaStreamCallbackTimeInfo * /*time_info*/,
  176. PaStreamCallbackFlags /*status_flags*/,
  177. void *user_data) {
  178. auto dlg = reinterpret_cast<CRealtimeSpeechRecognitionDlg *>(user_data);
  179. auto stream = dlg->stream_;
  180. if (stream) {
  181. AcceptWaveform(stream, 16000, reinterpret_cast<const float *>(input_buffer),
  182. frames_per_buffer);
  183. }
  184. return dlg->started_ ? paContinue : paComplete;
  185. }
  186. void CRealtimeSpeechRecognitionDlg::OnBnClickedOk() {
  187. if (!recognizer_) {
  188. AppendLineToMultilineEditCtrl("Creating recognizer...");
  189. InitRecognizer();
  190. if (!recognizer_) {
  191. // failed to create the recognizer
  192. return;
  193. }
  194. AppendLineToMultilineEditCtrl("Recognizer created!");
  195. }
  196. if (!started_) {
  197. started_ = true;
  198. if (stream_) {
  199. DestroyStream(stream_);
  200. stream_ = nullptr;
  201. }
  202. stream_ = CreateStream(recognizer_);
  203. PaStreamParameters param;
  204. param.device = Pa_GetDefaultInputDevice();
  205. const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
  206. param.channelCount = 1;
  207. param.sampleFormat = paFloat32;
  208. param.suggestedLatency = info->defaultLowInputLatency;
  209. param.hostApiSpecificStreamInfo = nullptr;
  210. float sample_rate = 16000;
  211. pa_stream_ = nullptr;
  212. PaError err =
  213. Pa_OpenStream(&pa_stream_, &param, nullptr, /* &outputParameters, */
  214. sample_rate,
  215. 0, // frames per buffer
  216. paClipOff, // we won't output out of range samples
  217. // so don't bother clipping them
  218. RecordCallback, this);
  219. if (err != paNoError) {
  220. AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") +
  221. Pa_GetErrorText(err));
  222. my_btn_.EnableWindow(FALSE);
  223. return;
  224. }
  225. err = Pa_StartStream(pa_stream_);
  226. if (err != paNoError) {
  227. AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") +
  228. Pa_GetErrorText(err));
  229. my_btn_.EnableWindow(FALSE);
  230. return;
  231. }
  232. AppendLineToMultilineEditCtrl("Started! Please speak");
  233. my_btn_.SetWindowText(_T("Stop"));
  234. thread_ = new RecognizerThread(this);
  235. thread_->CreateThread(CREATE_SUSPENDED);
  236. thread_->m_bAutoDelete = false; // Let me delete it.
  237. thread_->ResumeThread();
  238. } else {
  239. started_ = false;
  240. Pa_Sleep(200); // sleep for 200ms
  241. if (pa_stream_) {
  242. PaError err = Pa_CloseStream(pa_stream_);
  243. if (err != paNoError) {
  244. AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") +
  245. Pa_GetErrorText(err));
  246. my_btn_.EnableWindow(FALSE);
  247. return;
  248. }
  249. }
  250. pa_stream_ = nullptr;
  251. WaitForSingleObject(thread_->m_hThread, INFINITE);
  252. delete thread_;
  253. thread_ = nullptr;
  254. // AfxMessageBox("stopped", MB_OK);
  255. my_btn_.SetWindowText(_T("Start"));
  256. AppendLineToMultilineEditCtrl("Stopped");
  257. }
  258. }
  259. bool CRealtimeSpeechRecognitionDlg::Exists(const std::string &filename) {
  260. std::ifstream is(filename);
  261. return is.good();
  262. }
  263. void CRealtimeSpeechRecognitionDlg::InitRecognizer() {
  264. std::string encoder_param = "./encoder_jit_trace-pnnx.ncnn.param";
  265. std::string encoder_bin = "./encoder_jit_trace-pnnx.ncnn.bin";
  266. std::string decoder_param = "./decoder_jit_trace-pnnx.ncnn.param";
  267. std::string decoder_bin = "./decoder_jit_trace-pnnx.ncnn.bin";
  268. std::string joiner_param = "./joiner_jit_trace-pnnx.ncnn.param";
  269. std::string joiner_bin = "./joiner_jit_trace-pnnx.ncnn.bin";
  270. std::string tokens = "./tokens.txt";
  271. bool is_ok = true;
  272. if (!Exists(encoder_param)) {
  273. std::string msg = encoder_param + " does not exist!";
  274. AppendLineToMultilineEditCtrl(msg);
  275. is_ok = false;
  276. }
  277. if (!Exists(encoder_bin)) {
  278. std::string msg = encoder_bin + " does not exist!";
  279. AppendLineToMultilineEditCtrl(msg);
  280. is_ok = false;
  281. }
  282. if (!Exists(decoder_param)) {
  283. std::string msg = decoder_param + " does not exist!";
  284. AppendLineToMultilineEditCtrl(msg);
  285. is_ok = false;
  286. }
  287. if (!Exists(decoder_bin)) {
  288. std::string msg = decoder_bin + " does not exist!";
  289. AppendLineToMultilineEditCtrl(msg);
  290. is_ok = false;
  291. }
  292. if (!Exists(joiner_param)) {
  293. std::string msg = joiner_param + " does not exist!";
  294. AppendLineToMultilineEditCtrl(msg);
  295. is_ok = false;
  296. }
  297. if (!Exists(joiner_bin)) {
  298. std::string msg = joiner_bin + " does not exist!";
  299. AppendLineToMultilineEditCtrl(msg);
  300. is_ok = false;
  301. }
  302. if (!Exists(tokens)) {
  303. std::string msg = tokens + " does not exist!";
  304. AppendLineToMultilineEditCtrl(msg);
  305. is_ok = false;
  306. }
  307. if (!is_ok) {
  308. my_btn_.EnableWindow(FALSE);
  309. std::string msg =
  310. "\r\nPlease go to\r\n"
  311. "https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html"
  312. "\r\n";
  313. msg += "to download a pre-trained model.\r\n\r\n";
  314. msg +=
  315. "We use the following model as an example to show you how "
  316. "to do "
  317. "that.\r\n";
  318. msg +=
  319. "https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/"
  320. "zipformer-transucer-models.html#csukuangfj-sherpa-ncnn-"
  321. "streaming-zipformer-bilingual-zh-en-2023-02-13-bilingual-"
  322. "chinese-english";
  323. msg += "\r\n\r\n";
  324. msg +=
  325. "wget "
  326. "https://huggingface.co/csukuangfj/"
  327. "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
  328. "13/resolve/main/encoder_jit_trace-pnnx.ncnn.param\r\n";
  329. msg +=
  330. "wget "
  331. "https://huggingface.co/csukuangfj/"
  332. "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
  333. "13/resolve/main/encoder_jit_trace-pnnx.ncnn.bin\r\n";
  334. msg +=
  335. "wget "
  336. "https://huggingface.co/csukuangfj/"
  337. "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
  338. "13/resolve/main/decoder_jit_trace-pnnx.ncnn.param\r\n";
  339. msg +=
  340. "wget "
  341. "https://huggingface.co/csukuangfj/"
  342. "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
  343. "13/resolve/main/decoder_jit_trace-pnnx.ncnn.bin\r\n";
  344. msg +=
  345. "wget "
  346. "https://huggingface.co/csukuangfj/"
  347. "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
  348. "13/resolve/main/joiner_jit_trace-pnnx.ncnn.param\r\n";
  349. msg +=
  350. "wget "
  351. "https://huggingface.co/csukuangfj/"
  352. "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
  353. "13/resolve/main/joiner_jit_trace-pnnx.ncnn.bin\r\n";
  354. msg +=
  355. "https://huggingface.co/csukuangfj/"
  356. "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
  357. "13/resolve/main/tokens.txt\r\n";
  358. msg += "\r\n\r\nThat's it!\r\n";
  359. AppendLineToMultilineEditCtrl(msg);
  360. return;
  361. }
  362. SherpaNcnnRecognizerConfig config;
  363. memset(&config, 0, sizeof(config));
  364. config.model_config.num_threads = 1;
  365. config.decoder_config.decoding_method = "greedy_search";
  366. config.decoder_config.num_active_paths = 4;
  367. config.feat_config.sampling_rate = 16000;
  368. config.feat_config.feature_dim = 80;
  369. config.enable_endpoint = 1;
  370. config.rule1_min_trailing_silence = 1.2f;
  371. config.rule2_min_trailing_silence = 0.8f;
  372. config.rule3_min_utterance_length = 300.0f;
  373. config.model_config.tokens = tokens.c_str();
  374. config.model_config.encoder_param = encoder_param.c_str();
  375. config.model_config.encoder_bin = encoder_bin.c_str();
  376. config.model_config.decoder_param = decoder_param.c_str();
  377. config.model_config.decoder_bin = decoder_bin.c_str();
  378. config.model_config.joiner_param = joiner_param.c_str();
  379. config.model_config.joiner_bin = joiner_bin.c_str();
  380. recognizer_ = CreateRecognizer(&config);
  381. }
  // Formats recognition results as numbered lines joined by "\r\n".
  //
  // results: finished segments; segment k is written as "k: <text>".
  // s:       text of the segment in progress; if it is not empty it is added
  //          as one more line, numbered results.size().
  // Returns: the joined lines without a trailing line break; an empty string
  //          if there is nothing to show.
  static std::string Cat(const std::vector<std::string> &results,
                         const std::string &s) {
    std::ostringstream os;
    const char *sep = "";
    size_t index = 0;  // unsigned, like the container size it counts up to

    for (const std::string &segment : results) {
      os << sep << index << ": " << segment;
      sep = "\r\n";
      ++index;
    }

    if (!s.empty()) {
      os << sep << index << ": " << s;
    }

    return os.str();
  }
  396. int CRealtimeSpeechRecognitionDlg::RunThread() {
  397. std::vector<std::string> results;
  398. std::string last_text;
  399. while (started_) {
  400. while (IsReady(recognizer_, stream_)) {
  401. Decode(recognizer_, stream_);
  402. }
  403. auto r = GetResult(recognizer_, stream_);
  404. std::string text = r->text;
  405. DestroyResult(r);
  406. if (!text.empty() && last_text != text) {
  407. // CString str;
  408. // str.Format(_T("%s"), Cat(results, text).c_str());
  409. auto str = Utf8ToUtf16(Cat(results, text).c_str());
  410. my_text_.SetWindowText(str.c_str());
  411. my_text_.SetFocus();
  412. my_text_.SetSel(-1);
  413. last_text = text;
  414. }
  415. int is_endpoint = IsEndpoint(recognizer_, stream_);
  416. if (is_endpoint) {
  417. Reset(recognizer_, stream_);
  418. if (!text.empty()) {
  419. results.push_back(std::move(text));
  420. }
  421. }
  422. Pa_Sleep(100); // sleep for 100ms
  423. }
  424. return 0;
  425. }