123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489 |
- // RealtimeSpeechRecognitionDlg.cpp : implementation file
- //
- // clang-format off
- #include "pch.h"
- #include "framework.h"
- #include "afxdialogex.h"
- // clang-format on
- #include "RealtimeSpeechRecognitionDlg.h"
- #include <fstream>
- #include <sstream>
- #include <string>
- #include <vector>
- #include "RealtimeSpeechRecognition.h"
- #ifdef _DEBUG
- #define new DEBUG_NEW
- #endif
- Microphone::Microphone() {
- PaError err = Pa_Initialize();
- if (err != paNoError) {
- fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
- exit(-2);
- }
- }
- Microphone::~Microphone() {
- PaError err = Pa_Terminate();
- if (err != paNoError) {
- fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
- exit(-2);
- }
- }
- // CRealtimeSpeechRecognitionDlg dialog
- CRealtimeSpeechRecognitionDlg::CRealtimeSpeechRecognitionDlg(
- CWnd *pParent /*=nullptr*/)
- : CDialogEx(IDD_REALTIMESPEECHRECOGNITION_DIALOG, pParent) {
- m_hIcon = AfxGetApp()->LoadIcon(IDR_MAINFRAME);
- }
- CRealtimeSpeechRecognitionDlg::~CRealtimeSpeechRecognitionDlg() {
- if (recognizer_) {
- DestroyRecognizer(recognizer_);
- recognizer_ = nullptr;
- }
- }
// Dialog data exchange hook: after the base-class exchange, attaches the
// IDOK control to my_btn_ and the IDC_EDIT1 control to my_text_.
//
// @param pDX  Data-exchange context supplied by the framework.
void CRealtimeSpeechRecognitionDlg::DoDataExchange(CDataExchange *pDX) {
  CDialogEx::DoDataExchange(pDX);
  DDX_Control(pDX, IDOK, my_btn_);        // start/stop button
  DDX_Control(pDX, IDC_EDIT1, my_text_);  // text output control
}
// Message map for the dialog: paint and drag-icon queries are handled by this
// class, and a click on the IDOK button is routed to OnBnClickedOk.
BEGIN_MESSAGE_MAP(CRealtimeSpeechRecognitionDlg, CDialogEx)
ON_WM_PAINT()
ON_WM_QUERYDRAGICON()
ON_BN_CLICKED(IDOK, &CRealtimeSpeechRecognitionDlg::OnBnClickedOk)
END_MESSAGE_MAP()
- // CRealtimeSpeechRecognitionDlg message handlers
- BOOL CRealtimeSpeechRecognitionDlg::OnInitDialog() {
- CDialogEx::OnInitDialog();
- // Set the icon for this dialog. The framework does this automatically
- // when the application's main window is not a dialog
- SetIcon(m_hIcon, TRUE); // Set big icon
- SetIcon(m_hIcon, FALSE); // Set small icon
- // TODO: Add extra initialization here
- InitMicrophone();
- return TRUE; // return TRUE unless you set the focus to a control
- }
- // If you add a minimize button to your dialog, you will need the code below
- // to draw the icon. For MFC applications using the document/view model,
- // this is automatically done for you by the framework.
- void CRealtimeSpeechRecognitionDlg::OnPaint() {
- if (IsIconic()) {
- CPaintDC dc(this); // device context for painting
- SendMessage(WM_ICONERASEBKGND, reinterpret_cast<WPARAM>(dc.GetSafeHdc()),
- 0);
- // Center icon in client rectangle
- int cxIcon = GetSystemMetrics(SM_CXICON);
- int cyIcon = GetSystemMetrics(SM_CYICON);
- CRect rect;
- GetClientRect(&rect);
- int x = (rect.Width() - cxIcon + 1) / 2;
- int y = (rect.Height() - cyIcon + 1) / 2;
- // Draw the icon
- dc.DrawIcon(x, y, m_hIcon);
- } else {
- CDialogEx::OnPaint();
- }
- }
- // The system calls this function to obtain the cursor to display while the user
- // drags
- // the minimized window.
- HCURSOR CRealtimeSpeechRecognitionDlg::OnQueryDragIcon() {
- return static_cast<HCURSOR>(m_hIcon);
- }
// Converts a UTF-8 encoded byte string to UTF-16.
//
// Based on
// https://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring
//
// Code points above U+FFFF are written as a surrogate pair (two wchar_t
// units), regardless of the width of wchar_t on the platform.
//
// @param utf8  Input bytes, expected to be well-formed UTF-8.
// @return      The UTF-16 encoding of `utf8`; empty for empty input.
// @throws std::logic_error("not a UTF-8 string") when the input has a stray
//         continuation byte, an invalid lead byte, a truncated sequence, an
//         overlong encoding (e.g. C0 80), an encoded surrogate
//         (U+D800..U+DFFF) or a value above U+10FFFF.
static std::wstring Utf8ToUtf16(const std::string &utf8) {
  std::wstring utf16;
  // Every UTF-8 sequence of k bytes yields at most k UTF-16 units, so this
  // single reservation is always sufficient.
  utf16.reserve(utf8.size());

  size_t pos = 0;
  while (pos < utf8.size()) {
    const unsigned char lead = static_cast<unsigned char>(utf8[pos++]);

    unsigned long code_point = 0;
    size_t continuation_count = 0;
    // Smallest code point that legitimately needs this sequence length;
    // anything below it is an overlong encoding.
    unsigned long min_code_point = 0;

    if (lead <= 0x7F) {
      code_point = lead;
    } else if (lead <= 0xBF) {
      // A continuation byte cannot start a sequence.
      throw std::logic_error("not a UTF-8 string");
    } else if (lead <= 0xDF) {
      code_point = lead & 0x1F;
      continuation_count = 1;
      min_code_point = 0x80;
    } else if (lead <= 0xEF) {
      code_point = lead & 0x0F;
      continuation_count = 2;
      min_code_point = 0x800;
    } else if (lead <= 0xF7) {
      code_point = lead & 0x07;
      continuation_count = 3;
      min_code_point = 0x10000;
    } else {
      throw std::logic_error("not a UTF-8 string");
    }

    for (size_t k = 0; k < continuation_count; ++k) {
      if (pos == utf8.size()) {
        throw std::logic_error("not a UTF-8 string");
      }
      const unsigned char trail = static_cast<unsigned char>(utf8[pos++]);
      if (trail < 0x80 || trail > 0xBF) {
        throw std::logic_error("not a UTF-8 string");
      }
      code_point = (code_point << 6) | (trail & 0x3F);
    }

    if (code_point < min_code_point) {
      // Overlong form: would otherwise smuggle in e.g. an embedded NUL.
      throw std::logic_error("not a UTF-8 string");
    }
    if (code_point >= 0xD800 && code_point <= 0xDFFF) {
      throw std::logic_error("not a UTF-8 string");
    }
    if (code_point > 0x10FFFF) {
      throw std::logic_error("not a UTF-8 string");
    }

    if (code_point <= 0xFFFF) {
      utf16 += static_cast<wchar_t>(code_point);
    } else {
      const unsigned long offset = code_point - 0x10000;
      utf16 += static_cast<wchar_t>((offset >> 10) + 0xD800);    // high
      utf16 += static_cast<wchar_t>((offset & 0x3FF) + 0xDC00);  // low
    }
  }

  return utf16;
}
- void CRealtimeSpeechRecognitionDlg::AppendTextToEditCtrl(const std::string &s) {
- // get the initial text length
- int nLength = my_text_.GetWindowTextLength();
- // put the selection at the end of text
- my_text_.SetSel(nLength, nLength);
- // replace the selection
- std::wstring wstr = Utf8ToUtf16(s);
- // my_text_.ReplaceSel(wstr.c_str());
- my_text_.ReplaceSel(wstr.c_str());
- }
- void CRealtimeSpeechRecognitionDlg::AppendLineToMultilineEditCtrl(
- const std::string &s) {
- AppendTextToEditCtrl("\r\n" + s);
- }
- void CRealtimeSpeechRecognitionDlg::InitMicrophone() {
- int default_device = Pa_GetDefaultInputDevice();
- int device_count = Pa_GetDeviceCount();
- if (default_device == paNoDevice) {
- // CString str;
- // str.Format(_T("No default input device found!"));
- // AfxMessageBox(str, MB_OK | MB_ICONSTOP);
- // exit(-1);
- AppendLineToMultilineEditCtrl("No default input device found!");
- my_btn_.EnableWindow(FALSE);
- return;
- }
- AppendLineToMultilineEditCtrl(std::string("Selected device ") +
- Pa_GetDeviceInfo(default_device)->name);
- }
- static int32_t RecordCallback(const void *input_buffer,
- void * /*output_buffer*/,
- unsigned long frames_per_buffer, // NOLINT
- const PaStreamCallbackTimeInfo * /*time_info*/,
- PaStreamCallbackFlags /*status_flags*/,
- void *user_data) {
- auto dlg = reinterpret_cast<CRealtimeSpeechRecognitionDlg *>(user_data);
- auto stream = dlg->stream_;
- if (stream) {
- AcceptWaveform(stream, 16000, reinterpret_cast<const float *>(input_buffer),
- frames_per_buffer);
- }
- return dlg->started_ ? paContinue : paComplete;
- }
- void CRealtimeSpeechRecognitionDlg::OnBnClickedOk() {
- if (!recognizer_) {
- AppendLineToMultilineEditCtrl("Creating recognizer...");
- InitRecognizer();
- if (!recognizer_) {
- // failed to create the recognizer
- return;
- }
- AppendLineToMultilineEditCtrl("Recognizer created!");
- }
- if (!started_) {
- started_ = true;
- if (stream_) {
- DestroyStream(stream_);
- stream_ = nullptr;
- }
- stream_ = CreateStream(recognizer_);
- PaStreamParameters param;
- param.device = Pa_GetDefaultInputDevice();
- const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
- param.channelCount = 1;
- param.sampleFormat = paFloat32;
- param.suggestedLatency = info->defaultLowInputLatency;
- param.hostApiSpecificStreamInfo = nullptr;
- float sample_rate = 16000;
- pa_stream_ = nullptr;
- PaError err =
- Pa_OpenStream(&pa_stream_, ¶m, nullptr, /* &outputParameters, */
- sample_rate,
- 0, // frames per buffer
- paClipOff, // we won't output out of range samples
- // so don't bother clipping them
- RecordCallback, this);
- if (err != paNoError) {
- AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") +
- Pa_GetErrorText(err));
- my_btn_.EnableWindow(FALSE);
- return;
- }
- err = Pa_StartStream(pa_stream_);
- if (err != paNoError) {
- AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") +
- Pa_GetErrorText(err));
- my_btn_.EnableWindow(FALSE);
- return;
- }
- AppendLineToMultilineEditCtrl("Started! Please speak");
- my_btn_.SetWindowText(_T("Stop"));
- thread_ = new RecognizerThread(this);
- thread_->CreateThread(CREATE_SUSPENDED);
- thread_->m_bAutoDelete = false; // Let me delete it.
- thread_->ResumeThread();
- } else {
- started_ = false;
- Pa_Sleep(200); // sleep for 200ms
- if (pa_stream_) {
- PaError err = Pa_CloseStream(pa_stream_);
- if (err != paNoError) {
- AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") +
- Pa_GetErrorText(err));
- my_btn_.EnableWindow(FALSE);
- return;
- }
- }
- pa_stream_ = nullptr;
- WaitForSingleObject(thread_->m_hThread, INFINITE);
- delete thread_;
- thread_ = nullptr;
- // AfxMessageBox("stopped", MB_OK);
- my_btn_.SetWindowText(_T("Start"));
- AppendLineToMultilineEditCtrl("Stopped");
- }
- }
- bool CRealtimeSpeechRecognitionDlg::Exists(const std::string &filename) {
- std::ifstream is(filename);
- return is.good();
- }
- void CRealtimeSpeechRecognitionDlg::InitRecognizer() {
- std::string encoder_param = "./encoder_jit_trace-pnnx.ncnn.param";
- std::string encoder_bin = "./encoder_jit_trace-pnnx.ncnn.bin";
- std::string decoder_param = "./decoder_jit_trace-pnnx.ncnn.param";
- std::string decoder_bin = "./decoder_jit_trace-pnnx.ncnn.bin";
- std::string joiner_param = "./joiner_jit_trace-pnnx.ncnn.param";
- std::string joiner_bin = "./joiner_jit_trace-pnnx.ncnn.bin";
- std::string tokens = "./tokens.txt";
- bool is_ok = true;
- if (!Exists(encoder_param)) {
- std::string msg = encoder_param + " does not exist!";
- AppendLineToMultilineEditCtrl(msg);
- is_ok = false;
- }
- if (!Exists(encoder_bin)) {
- std::string msg = encoder_bin + " does not exist!";
- AppendLineToMultilineEditCtrl(msg);
- is_ok = false;
- }
- if (!Exists(decoder_param)) {
- std::string msg = decoder_param + " does not exist!";
- AppendLineToMultilineEditCtrl(msg);
- is_ok = false;
- }
- if (!Exists(decoder_bin)) {
- std::string msg = decoder_bin + " does not exist!";
- AppendLineToMultilineEditCtrl(msg);
- is_ok = false;
- }
- if (!Exists(joiner_param)) {
- std::string msg = joiner_param + " does not exist!";
- AppendLineToMultilineEditCtrl(msg);
- is_ok = false;
- }
- if (!Exists(joiner_bin)) {
- std::string msg = joiner_bin + " does not exist!";
- AppendLineToMultilineEditCtrl(msg);
- is_ok = false;
- }
- if (!Exists(tokens)) {
- std::string msg = tokens + " does not exist!";
- AppendLineToMultilineEditCtrl(msg);
- is_ok = false;
- }
- if (!is_ok) {
- my_btn_.EnableWindow(FALSE);
- std::string msg =
- "\r\nPlease go to\r\n"
- "https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html"
- "\r\n";
- msg += "to download a pre-trained model.\r\n\r\n";
- msg +=
- "We use the following model as an example to show you how "
- "to do "
- "that.\r\n";
- msg +=
- "https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/"
- "zipformer-transucer-models.html#csukuangfj-sherpa-ncnn-"
- "streaming-zipformer-bilingual-zh-en-2023-02-13-bilingual-"
- "chinese-english";
- msg += "\r\n\r\n";
- msg +=
- "wget "
- "https://huggingface.co/csukuangfj/"
- "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
- "13/resolve/main/encoder_jit_trace-pnnx.ncnn.param\r\n";
- msg +=
- "wget "
- "https://huggingface.co/csukuangfj/"
- "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
- "13/resolve/main/encoder_jit_trace-pnnx.ncnn.bin\r\n";
- msg +=
- "wget "
- "https://huggingface.co/csukuangfj/"
- "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
- "13/resolve/main/decoder_jit_trace-pnnx.ncnn.param\r\n";
- msg +=
- "wget "
- "https://huggingface.co/csukuangfj/"
- "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
- "13/resolve/main/decoder_jit_trace-pnnx.ncnn.bin\r\n";
- msg +=
- "wget "
- "https://huggingface.co/csukuangfj/"
- "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
- "13/resolve/main/joiner_jit_trace-pnnx.ncnn.param\r\n";
- msg +=
- "wget "
- "https://huggingface.co/csukuangfj/"
- "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
- "13/resolve/main/joiner_jit_trace-pnnx.ncnn.bin\r\n";
- msg +=
- "https://huggingface.co/csukuangfj/"
- "sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-"
- "13/resolve/main/tokens.txt\r\n";
- msg += "\r\n\r\nThat's it!\r\n";
- AppendLineToMultilineEditCtrl(msg);
- return;
- }
- SherpaNcnnRecognizerConfig config;
- memset(&config, 0, sizeof(config));
- config.model_config.num_threads = 1;
- config.decoder_config.decoding_method = "greedy_search";
- config.decoder_config.num_active_paths = 4;
- config.feat_config.sampling_rate = 16000;
- config.feat_config.feature_dim = 80;
- config.enable_endpoint = 1;
- config.rule1_min_trailing_silence = 1.2f;
- config.rule2_min_trailing_silence = 0.8f;
- config.rule3_min_utterance_length = 300.0f;
- config.model_config.tokens = tokens.c_str();
- config.model_config.encoder_param = encoder_param.c_str();
- config.model_config.encoder_bin = encoder_bin.c_str();
- config.model_config.decoder_param = decoder_param.c_str();
- config.model_config.decoder_bin = decoder_bin.c_str();
- config.model_config.joiner_param = joiner_param.c_str();
- config.model_config.joiner_bin = joiner_bin.c_str();
- recognizer_ = CreateRecognizer(&config);
- }
// Formats finished results plus the in-progress text as numbered lines.
//
// Each entry is rendered as "<index>: <text>" with indices starting at 0 and
// entries separated by "\r\n" (no trailing separator). `s` is appended as the
// next numbered entry only when it is non-empty.
//
// @param results  Completed utterances, in order.
// @param s        Current partial text; skipped when empty.
// @return The joined text, or an empty string when there is nothing to show.
static std::string Cat(const std::vector<std::string> &results,
                       const std::string &s) {
  std::ostringstream os;
  const char *sep = "";

  // size_t index: avoids the signed/unsigned comparison against size().
  std::size_t index = 0;
  for (; index != results.size(); ++index) {
    os << sep << index << ": " << results[index];
    sep = "\r\n";
  }

  if (!s.empty()) {
    os << sep << index << ": " << s;
  }

  return os.str();
}
- int CRealtimeSpeechRecognitionDlg::RunThread() {
- std::vector<std::string> results;
- std::string last_text;
- while (started_) {
- while (IsReady(recognizer_, stream_)) {
- Decode(recognizer_, stream_);
- }
- auto r = GetResult(recognizer_, stream_);
- std::string text = r->text;
- DestroyResult(r);
- if (!text.empty() && last_text != text) {
- // CString str;
- // str.Format(_T("%s"), Cat(results, text).c_str());
- auto str = Utf8ToUtf16(Cat(results, text).c_str());
- my_text_.SetWindowText(str.c_str());
- my_text_.SetFocus();
- my_text_.SetSel(-1);
- last_text = text;
- }
- int is_endpoint = IsEndpoint(recognizer_, stream_);
- if (is_endpoint) {
- Reset(recognizer_, stream_);
- if (!text.empty()) {
- results.push_back(std::move(text));
- }
- }
- Pa_Sleep(100); // sleep for 100ms
- }
- return 0;
- }
|