<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3.dtd">
<article article-type="research-article" dtd-version="1.3" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xml:lang="ru"><front><journal-meta><journal-id journal-id-type="publisher-id">ntv</journal-id><journal-title-group><journal-title xml:lang="ru">Научно-технический вестник информационных технологий, механики и оптики</journal-title><trans-title-group xml:lang="en"><trans-title>Scientific and Technical Journal of Information Technologies, Mechanics and Optics</trans-title></trans-title-group></journal-title-group><issn pub-type="ppub">2226-1494</issn><issn pub-type="epub">2500-0373</issn><publisher><publisher-name>Университет ИТМО</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="doi">10.17586/2226-1494-2023-23-1-88-95</article-id><article-id custom-type="elpub" pub-id-type="custom">ntv-334</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Article</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="ru"><subject>КОМПЬЮТЕРНЫЕ СИСТЕМЫ И ИНФОРМАЦИОННЫЕ ТЕХНОЛОГИИ</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="en"><subject>COMPUTER SCIENCE</subject></subj-group></article-categories><title-group><article-title>Диалоговая система на основе устных разговоров с доступом к неструктурированной базе знаний</article-title><trans-title-group xml:lang="en"><trans-title>Dialogue system based on spoken conversations with access to an unstructured knowledge base</trans-title></trans-title-group></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-9054-5252</contrib-id><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Маслюхин</surname><given-names>С. М.</given-names></name><name name-style="western" xml:lang="en"><surname>Masliukhin</surname><given-names>S. 
M.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Маслюхин Сергей Михайлович - ведущий научный сотрудник; инженер</p><p>Санкт-Петербург, 194044</p><p>Санкт-Петербург, 197101</p></bio><bio xml:lang="en"><p>Sergei M. Masliukhin - Leading Researcher; Engineer</p><p>Saint Petersburg, 194044</p><p>Saint Petersburg, 197101</p></bio><email xlink:type="simple">maslyukhin@speechpro.com</email><xref ref-type="aff" rid="aff-1"/></contrib></contrib-group><aff-alternatives id="aff-1"><aff xml:lang="ru">ООО «ЦРТ-инновации»; Университет ИТМО<country>Россия</country></aff><aff xml:lang="en">STC-innovations Limited; ITMO University<country>Russian Federation</country></aff></aff-alternatives><pub-date pub-type="collection"><year>2023</year></pub-date><pub-date pub-type="epub"><day>18</day><month>12</month><year>2024</year></pub-date><volume>23</volume><issue>1</issue><fpage>88</fpage><lpage>95</lpage><permissions><copyright-statement>Copyright &#x00A9; Маслюхин С.М., 2024</copyright-statement><copyright-year>2024</copyright-year><copyright-holder xml:lang="ru">Маслюхин С.М.</copyright-holder><copyright-holder xml:lang="en">Masliukhin S.M.</copyright-holder><license license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>This work is licensed under a Creative Commons Attribution 4.0 License.</license-p></license></permissions><self-uri xlink:href="https://ntv.elpub.ru/jour/article/view/334">https://ntv.elpub.ru/jour/article/view/334</self-uri><abstract><p>Предмет исследования. Представлен подход к построению задачно-ориентированной диалоговой системы (разговорного агента) с доступом к неструктурированной базе знаний на основе устных разговоров с применением аугментации письменной речи, имитирующей результаты распознавания устной речи, комбинирования предсказаний классификаторов, генерации текста, дополненной поиском. Метод. 
Предложенный подход предусматривает аугментацию обучающих данных двумя способами: преобразованием текста в речь и обратно с помощью систем синтеза и распознавания речи; заменой части слов на основе матрицы спутываний системы распознавания речи. Диалоговая система с доступом к неструктурированной базе знаний решает задачу обнаружения высказывания, для которого необходим поиск дополнительной информации в неструктурированной базе знаний. С этой целью выполнено обучение моделей Support Vector Machine, Convolutional Neural Network, Bidirectional Encoder Representations from Transformers и Generative Pre-trained Transformer 2. Лучшие из представленных моделей использованы при формировании предсказания путем взвешенного комбинирования. Осуществлен выбор подходящего текстового фрагмента из базы знаний и генерация обоснованного ответа. Поставленные задачи решены путем адаптации модели генерации текста, аугментированной поиском Retrieval Augmented Generation. Основные результаты. Выполнена апробация подхода на данных конкурса 10th Dialogue System Technology Challenge (DSTC10). По всем метрикам, кроме Precision, новый подход значительно превзошел результаты базовых моделей, предложенных организаторами конкурса DSTC10. Практическая значимость. Результаты работы могут найти применение при создании чат-бот систем, обеспечивающих автоматическую обработку обращений пользователей на естественном языке на основе неструктурированной базы знаний, например базы ответов на часто задаваемые вопросы.</p></abstract><trans-abstract xml:lang="en"><p>This paper describes an approach for constructing a task-oriented dialog system (a conversational agent) with an unstructured knowledge access based on spoken conversations including: written speech augmentation that simulates the speech recognition results; combination of classifiers; retrieval augmented text generation. 
The proposed approach provides the training data augmentation in two ways: by converting the original texts into sound waves by a text-to-speech model and then transforming back into texts by an automated speech recognition model; injecting artificially generated errors based on phonetic similarity. A dialogue system with access to the unstructured knowledge base solves the task of detecting a turn, which requires searching for additional information in an unstructured knowledge base. For this purpose, the Support Vector Machine, Convolutional Neural Network, Bidirectional Encoder Representations from Transformers, and Generative Pre-trained Transformer 2 models were trained. The best of the presented models are used in the weighted combination. Next, a suitable text fragment is selected from the knowledge base and a reasonable answer is generated. The tasks are solved by adapting the retrieval augmented text generation model Retrieval Augmented Generation. The proposed method was tested on the data from the 10th Dialogue System Technology Challenge. In all metrics, except Precision, the new approach significantly outperformed the results of the basic models proposed by the organizers of the competition. 
The results of the work can be used to create chat-bot systems that provide automatic processing of user requests in natural language based on an unstructured knowledge access, such as a database of answers to frequently asked questions.</p></trans-abstract><kwd-group xml:lang="ru"><kwd>диалоговые системы</kwd><kwd>разговорные агенты</kwd><kwd>поиск информации</kwd><kwd>текстовая аугментация</kwd><kwd>генерация</kwd><kwd>аугментированная поиском</kwd></kwd-group><kwd-group xml:lang="en"><kwd>dialogue systems</kwd><kwd>conversational agents</kwd><kwd>information retrieval</kwd><kwd>text augmentation</kwd><kwd>retrieval augmented generation</kwd></kwd-group><funding-group xml:lang="ru"><funding-statement>Исследование выполнено за счет гранта Российского научного фонда (№ 22-11-00128, https://rscf.ru/project/22-11-00128/).</funding-statement></funding-group><funding-group xml:lang="en"><funding-statement>This research is financially supported by the Russian Science Foundation (No. 22-11-00128, https://rscf.ru/project/22-11-00128/).</funding-statement></funding-group></article-meta></front><back><ref-list><title>References</title><ref id="cit1"><label>1</label><citation-alternatives><mixed-citation xml:lang="ru">Moghe N., Arora S., Banerjee S., Khapra M.M. Towards exploiting background knowledge for building conversation systems // Proc. of the 2018 Conference on Empirical Methods in Natural Language Processing. 2018. P. 2322–2332. https://doi.org/10.18653/v1/D18-1255</mixed-citation><mixed-citation xml:lang="en">Moghe N., Arora S., Banerjee S., Khapra M.M. Towards exploiting background knowledge for building conversation systems. Proc. of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018, pp. 2322–2332. https://doi.org/10.18653/v1/D18-1255</mixed-citation></citation-alternatives></ref><ref id="cit2"><label>2</label><citation-alternatives><mixed-citation xml:lang="ru">Dinan E., Roller S., Shuster K., Fan A., Auli M. 
Wizard of wikipedia: Knowledge-powered conversational agents // arXiv. 2019. arXiv:1811.01241. https://doi.org/10.48550/arXiv.1811.01241</mixed-citation><mixed-citation xml:lang="en">Dinan E., Roller S., Shuster K., Fan A., Auli M. Wizard of wikipedia: Knowledge-powered conversational agents. arXiv, 2019, arXiv:1811.01241. https://doi.org/10.48550/arXiv.1811.01241</mixed-citation></citation-alternatives></ref><ref id="cit3"><label>3</label><citation-alternatives><mixed-citation xml:lang="ru">Zhou K., Prabhumoye S., Black A.W. A dataset for document grounded conversations // Proc. of the 2018 Conference on Empirical Methods in Natural Language Processing. 2018. P. 708–713. https://doi.org/10.18653/v1/D18-1076</mixed-citation><mixed-citation xml:lang="en">Zhou K., Prabhumoye S., Black A.W. A dataset for document grounded conversations. Proc. of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018, pp. 708–713. https://doi.org/10.18653/v1/D18-1076</mixed-citation></citation-alternatives></ref><ref id="cit4"><label>4</label><citation-alternatives><mixed-citation xml:lang="ru">Hearst M., Dumais S., Osuna E., Platt J. Scholkopf B. Support vector machines // IEEE Intelligent Systems and their Applications. 1998. V. 13. N 4. P. 18–28. https://doi.org/10.1109/5254.708428</mixed-citation><mixed-citation xml:lang="en">Hearst M., Dumais S., Osuna E., Platt J. Scholkopf B. Support vector machines. IEEE Intelligent Systems and their Applications, 1998, vol. 13, no. 4, pp. 18–28. https://doi.org/10.1109/5254.708428</mixed-citation></citation-alternatives></ref><ref id="cit5"><label>5</label><citation-alternatives><mixed-citation xml:lang="ru">Johnson R., Zhang T. Convolutional neural networks for text categorization: Shallow word-level vs. deep character-level // ArXiv. 2016. arXiv:1609.00718. https://doi.org/10.48550/arXiv.1609.00718</mixed-citation><mixed-citation xml:lang="en">Johnson R., Zhang T. 
Convolutional neural networks for text categorization: Shallow word-level vs. deep character-level. ArXiv, 2016, arXiv:1609.00718. https://doi.org/10.48550/arXiv.1609.00718</mixed-citation></citation-alternatives></ref><ref id="cit6"><label>6</label><citation-alternatives><mixed-citation xml:lang="ru">Devlin J., Chang M.-W., Lee K., Toutanova K. BERT: Pre-training of deep bidirectional transformers for language understanding // Proc. of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Vol. 1 (Long and Short Papers). 2019. P. 4171–4186. https://doi.org/10.18653/v1/N19-1423</mixed-citation><mixed-citation xml:lang="en">Devlin J., Chang M.-W., Lee K., Toutanova K. BERT: Pre-training of deep bidirectional transformers for language understanding. Proc. of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Vol. 1 (Long and Short Papers), 2019, pp. 4171–4186. https://doi.org/10.18653/v1/N19-1423</mixed-citation></citation-alternatives></ref><ref id="cit7"><label>7</label><citation-alternatives><mixed-citation xml:lang="ru">Radford A., Narasimhan K., Salimans T., Sutskever I. Improving language understanding by generative pre-training: preprint. 2018.</mixed-citation><mixed-citation xml:lang="en">Radford A., Narasimhan K., Salimans T., Sutskever I. Improving language understanding by generative pre-training. Preprint. 2018.</mixed-citation></citation-alternatives></ref><ref id="cit8"><label>8</label><citation-alternatives><mixed-citation xml:lang="ru">Karpukhin V., Oğuz B., Min S., Lewis P., Wu L., Edunov S., Chen D., Yih W.-T. Dense passage retrieval for open-domain question answering // Proc. of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). 2020. P. 6769–6781. 
https://doi.org/10.18653/v1/2020.emnlp-main.550</mixed-citation><mixed-citation xml:lang="en">Karpukhin V., Oğuz B., Min S., Lewis P., Wu L., Edunov S., Chen D., Yih W.-T. Dense passage retrieval for open-domain question answering. Proc. of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2020, pp. 6769–6781. https://doi.org/10.18653/v1/2020.emnlp-main.550</mixed-citation></citation-alternatives></ref><ref id="cit9"><label>9</label><citation-alternatives><mixed-citation xml:lang="ru">Humeau S., Shuster K., Lachaux M., Weston J. Poly-encoders: Architectures and pre-training strategies for fast and accurate multi-sentence scoring // arXiv. 2020. arXiv:1905.01969. https://doi.org/10.48550/arXiv.1905.01969</mixed-citation><mixed-citation xml:lang="en">Humeau S., Shuster K., Lachaux M., Weston J. Poly-encoders: Architectures and pre-training strategies for fast and accurate multi-sentence scoring. arXiv, 2020, arXiv:1905.01969. https://doi.org/10.48550/arXiv.1905.01969</mixed-citation></citation-alternatives></ref><ref id="cit10"><label>10</label><citation-alternatives><mixed-citation xml:lang="ru">Lewis M., Liu Y., Goyal N., Ghazvininejad M., Mohamed A., Levy O., Stoyanov V., Zettlemoyer L. BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension // Proc. of the 58th Annual Meeting of the Association for Computational Linguistics. 2020. P. 7871–7880. https://doi.org/10.18653/v1/2020.acl-main.703</mixed-citation><mixed-citation xml:lang="en">Lewis M., Liu Y., Goyal N., Ghazvininejad M., Mohamed A., Levy O., Stoyanov V., Zettlemoyer L. BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. Proc. of the 58th Annual Meeting of the Association for Computational Linguistics, 2020, pp. 7871–7880. 
https://doi.org/10.18653/v1/2020.acl-main.703</mixed-citation></citation-alternatives></ref><ref id="cit11"><label>11</label><citation-alternatives><mixed-citation xml:lang="ru">Kim S., Liu Y., Jin D., Papangelis A., Hedayatnia B., Gopalakrishnan K., Hakkani-Tur D. DSTC10 Track Proposal: Knowledge-grounded Task-oriented Dialogue Modeling on Spoken Conversations. 2021.</mixed-citation><mixed-citation xml:lang="en">Kim S., Liu Y., Jin D., Papangelis A., Hedayatnia B., Gopalakrishnan K., Hakkani-Tur D. DSTC10 Track Proposal: Knowledge-grounded Task-oriented Dialogue Modeling on Spoken Conversations. 2021.</mixed-citation></citation-alternatives></ref><ref id="cit12"><label>12</label><citation-alternatives><mixed-citation xml:lang="ru">Kim S., Eric M., Gopalakrishnan K., Hedayatnia B., Liu Y. Hakkani-Tur D.Z. Beyond domain APIs: task-oriented conversational modeling with unstructured knowledge access // Proc. of the 21st Annual Meeting of the Special Interest Group on Discourse and Dialogue. 2020. P. 278–289.</mixed-citation><mixed-citation xml:lang="en">Kim S., Eric M., Gopalakrishnan K., Hedayatnia B., Liu Y. Hakkani-Tur D.Z. Beyond domain APIs: task-oriented conversational modeling with unstructured knowledge access. Proc. of the 21st Annual Meeting of the Special Interest Group on Discourse and Dialogue, 2020, pp. 278–289.</mixed-citation></citation-alternatives></ref><ref id="cit13"><label>13</label><citation-alternatives><mixed-citation xml:lang="ru">Budzianowski P., Wen T.-H., Tseng B.-H., Casanueva I., Ultes S., Ramadan O., Gašić M. MultiWOZ - A large-scale multi-domain wizard-of-oz dataset for task-oriented dialogue modelling // Proc. of the 2018 Conference on Empirical Methods in Natural Language Processing. 2018. P. 5016–5026. https://doi.org/10.18653/v1/D18-1547</mixed-citation><mixed-citation xml:lang="en">Budzianowski P., Wen T.-H., Tseng B.-H., Casanueva I., Ultes S., Ramadan O., Gašić M. 
MultiWOZ - A large-scale multi-domain wizard-of-oz dataset for task-oriented dialogue modelling. Proc. of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018, pp. 5016–5026. https://doi.org/10.18653/v1/D18-1547</mixed-citation></citation-alternatives></ref><ref id="cit14"><label>14</label><citation-alternatives><mixed-citation xml:lang="ru">Eric M., Goel R., Paul S., Sethi A., Agarwal S., Gao S., Kumar A., Goyal A., Ku P., Hakkani-Tür D. Multiwoz 2.1: Multi-domain dialogue state corrections and state tracking baselines // Proc. of the Twelfth Language Resources and Evaluation Conference. 2020. P. 422–428.</mixed-citation><mixed-citation xml:lang="en">Eric M., Goel R., Paul S., Sethi A., Agarwal S., Gao S., Kumar A., Goyal A., Ku P., Hakkani-Tür D. Multiwoz 2.1: Multi-domain dialogue state corrections and state tracking baselines. Proc. of the Twelfth Language Resources and Evaluation Conference, 2020, pp. 422–428.</mixed-citation></citation-alternatives></ref><ref id="cit15"><label>15</label><citation-alternatives><mixed-citation xml:lang="ru">Zang X., Rastogi A., Sunkara S., Gupta R., Zhang J., Chen J. MultiWOZ 2.2: A dialogue dataset with additional annotation corrections and state tracking baselines // Proc. of the 2nd Workshop on Natural Language Processing for Conversational AI. 2020. P. 109–117. https://doi.org/10.18653/v1/2020.nlp4convai-1.13</mixed-citation><mixed-citation xml:lang="en">Zang X., Rastogi A., Sunkara S., Gupta R., Zhang J., Chen J. MultiWOZ 2.2: A dialogue dataset with additional annotation corrections and state tracking baselines. Proc. of the 2nd Workshop on Natural Language Processing for Conversational AI, 2020, pp. 109–117. https://doi.org/10.18653/v1/2020.nlp4convai-1.13</mixed-citation></citation-alternatives></ref><ref id="cit16"><label>16</label><citation-alternatives><mixed-citation xml:lang="ru">Baevski A., Zhou H., Mohamed A., Auli M. 
Wav2vec 2.0: a framework for self-supervised learning of speech representations // Proc. of the 34th International Conference on Neural Information Processing Systems (NIPS’20). 2020. P. 12449–12460.</mixed-citation><mixed-citation xml:lang="en">Baevski A., Zhou H., Mohamed A., Auli M. Wav2vec 2.0: a framework for self-supervised learning of speech representations. Proc. of the 34th International Conference on Neural Information Processing Systems (NIPS’20), 2020, pp. 12449–12460.</mixed-citation></citation-alternatives></ref><ref id="cit17"><label>17</label><citation-alternatives><mixed-citation xml:lang="ru">Panayotov V., Chen G., Povey D., Khudanpur S., Librispeech: An ASR corpus based on public domain audio books // Proc. of the 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 2015. P. 5206–5210. https://doi.org/10.1109/ICASSP.2015.7178964</mixed-citation><mixed-citation xml:lang="en">Panayotov V., Chen G., Povey D., Khudanpur S., Librispeech: An ASR corpus based on public domain audio books. Proc. of the 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2015, pp. 5206–5210. https://doi.org/10.1109/ICASSP.2015.7178964</mixed-citation></citation-alternatives></ref><ref id="cit18"><label>18</label><citation-alternatives><mixed-citation xml:lang="ru">Heafield K. KenLM: Faster and Smaller language model queries // Proc. of the Sixth Workshop on Statistical Machine Translation. 2011. P. 187–197.</mixed-citation><mixed-citation xml:lang="en">Heafield K. KenLM: Faster and smaller language model queries. Proc. of the Sixth Workshop on Statistical Machine Translation, 2011, pp. 187–197.</mixed-citation></citation-alternatives></ref><ref id="cit19"><label>19</label><citation-alternatives><mixed-citation xml:lang="ru">Gopalakrishnan K., Hedayatnia B., Wang L., Liu Y., Hakkani-Tür D. Are neural open-domain dialog systems robust to speech recognition errors in the dialog history? 
an empirical study // Proc. Interspeech 2020. 2020. P. 911–915. https://doi.org/10.21437/Interspeech.2020-1508</mixed-citation><mixed-citation xml:lang="en">Gopalakrishnan K., Hedayatnia B., Wang L., Liu Y., Hakkani-Tür D. Are neural open-domain dialog systems robust to speech recognition errors in the dialog history? an empirical study. Proc. Interspeech 2020, 2020, pp. 911–915. https://doi.org/10.21437/Interspeech.2020-1508</mixed-citation></citation-alternatives></ref><ref id="cit20"><label>20</label><citation-alternatives><mixed-citation xml:lang="ru">Wang L., Fazel-Zarandi M., Tiwari A., Matsoukas S., Polymenakos L. Data Augmentation for Training Dialog Models Robust to Speech Recognition Errors // Proc. of the 2nd Workshop on Natural Language Processing for Conversational AI. 2020. P. 63–70. https://doi.org/10.18653/v1/2020.nlp4convai-1.8</mixed-citation><mixed-citation xml:lang="en">Wang L., Fazel-Zarandi M., Tiwari A., Matsoukas S., Polymenakos L. Data Augmentation for Training Dialog Models Robust to Speech Recognition Errors. Proc. of the 2nd Workshop on Natural Language Processing for Conversational AI, 2020, pp. 63–70. https://doi.org/10.18653/v1/2020.nlp4convai-1.8</mixed-citation></citation-alternatives></ref><ref id="cit21"><label>21</label><citation-alternatives><mixed-citation xml:lang="ru">Xu L., Lian J., Zhao W.X., Gong M., Shou L., Jiang D., Xie X., Wen J. Negative sampling for contrastive representation learning: A review // ArXiv. 2022. arXiv:2206.00212. https://doi.org/10.48550/arXiv.2206.00212</mixed-citation><mixed-citation xml:lang="en">Xu L., Lian J., Zhao W.X., Gong M., Shou L., Jiang D., Xie X., Wen J. Negative sampling for contrastive representation learning: A review. ArXiv, 2022, arXiv:2206.00212. https://doi.org/10.48550/arXiv.2206.00212</mixed-citation></citation-alternatives></ref></ref-list><fn-group><fn fn-type="conflict"><p>The authors declare that there are no conflicts of interest present.</p></fn></fn-group></back></article>
