<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article>
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.2" xml:lang="en"><front><journal-meta><journal-id journal-id-type="publisher-id">Discrete and Continuous Models and Applied Computational Science</journal-id><journal-title-group><journal-title xml:lang="en">Discrete and Continuous Models and Applied Computational Science</journal-title><trans-title-group xml:lang="ru"><trans-title>Discrete and Continuous Models and Applied Computational Science</trans-title></trans-title-group></journal-title-group><issn publication-format="print">2658-4670</issn><issn publication-format="electronic">2658-7149</issn><publisher><publisher-name xml:lang="en">Peoples' Friendship University of Russia named after Patrice Lumumba (RUDN University)</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">34463</article-id><article-id pub-id-type="doi">10.22363/2658-4670-2023-31-1-64-74</article-id><article-id pub-id-type="edn">VNWSXI</article-id><article-categories><subj-group subj-group-type="toc-heading" xml:lang="en"><subject>Articles</subject></subj-group><subj-group subj-group-type="toc-heading" xml:lang="ru"><subject>Статьи</subject></subj-group><subj-group subj-group-type="article-type"><subject>Research Article</subject></subj-group></article-categories><title-group><article-title xml:lang="en">Methods of extracting biomedical information from patents and scientific publications (on the example of chemical compounds)</article-title><trans-title-group xml:lang="ru"><trans-title>Методы извлечения биомедицинских текстов из патентов и научных публикаций (на примере химических соединений)</trans-title></trans-title-group></title-group><contrib-group><contrib contrib-type="author"><contrib-id 
contrib-id-type="orcid">https://orcid.org/0000-0002-1640-1357</contrib-id><name-alternatives><name xml:lang="en"><surname>Kolpakov</surname><given-names>Nikolay A.</given-names></name><name xml:lang="ru"><surname>Колпаков</surname><given-names>Н. А.</given-names></name></name-alternatives><bio xml:lang="en"><p>Master’s degree student of Phystech School of Applied Mathematics and Informatics</p></bio><email>kolpakov.na@phystech.edu</email><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><contrib-id contrib-id-type="orcid">https://orcid.org/0000-0003-0039-943X</contrib-id><name-alternatives><name xml:lang="en"><surname>Molodchenkov</surname><given-names>Alexey I.</given-names></name><name xml:lang="ru"><surname>Молодченков</surname><given-names>А. И.</given-names></name></name-alternatives><bio xml:lang="en"><p>Candidate of Technical Sciences, Federal Research Center “Computer Science and Control” of RAS employee, employee of the Peoples’ Friendship University of Russia</p></bio><email>aim@tesyan.ru</email><xref ref-type="aff" rid="aff2"/><xref ref-type="aff" rid="aff3"/></contrib><contrib contrib-type="author"><contrib-id contrib-id-type="orcid">https://orcid.org/0000-0003-4391-1958</contrib-id><name-alternatives><name xml:lang="en"><surname>Lukin</surname><given-names>Anton V.</given-names></name><name xml:lang="ru"><surname>Лукин</surname><given-names>А. 
В.</given-names></name></name-alternatives><bio xml:lang="en"><p>Federal Research Center “Computer Science and Control” of RAS employee, employee of the Peoples’ Friendship University of Russia</p></bio><email>antonvlukin@gmail.com</email><xref ref-type="aff" rid="aff2"/><xref ref-type="aff" rid="aff3"/></contrib></contrib-group><aff-alternatives id="aff1"><aff><institution xml:lang="en">Moscow Institute of Physics and Technology (MIPT)</institution></aff><aff><institution xml:lang="ru">Московский физико-технический институт</institution></aff></aff-alternatives><aff-alternatives id="aff2"><aff><institution xml:lang="en">Federal research center “Computer science and control” of RAS</institution></aff><aff><institution xml:lang="ru">Федеральный исследовательский центр «Информатика и управление» РАН</institution></aff></aff-alternatives><aff-alternatives id="aff3"><aff><institution xml:lang="en">Peoples’ Friendship University of Russia (RUDN University)</institution></aff><aff><institution xml:lang="ru">Российский университет дружбы народов</institution></aff></aff-alternatives><pub-date date-type="pub" iso-8601-date="2023-03-30" publication-format="electronic"><day>30</day><month>03</month><year>2023</year></pub-date><volume>31</volume><issue>1</issue><issue-title xml:lang="en">VOL 31, NO1 (2023)</issue-title><issue-title xml:lang="ru">ТОМ 31, №1 (2023)</issue-title><fpage>64</fpage><lpage>74</lpage><history><date date-type="received" iso-8601-date="2023-04-20"><day>20</day><month>04</month><year>2023</year></date></history><permissions><copyright-statement xml:lang="en">Copyright © 2023, Kolpakov N.A., Molodchenkov A.I., Lukin A.V.</copyright-statement><copyright-statement xml:lang="ru">Copyright © 2023, Колпаков Н.А., Молодченков А.И., Лукин А.В.</copyright-statement><copyright-year>2023</copyright-year><copyright-holder xml:lang="en">Kolpakov N.A., Molodchenkov A.I., Lukin A.V.</copyright-holder><copyright-holder xml:lang="ru">Колпаков Н.А., Молодченков А.И., 
Лукин А.В.</copyright-holder><ali:free_to_read xmlns:ali="http://www.niso.org/schemas/ali/1.0/"/><license><ali:license_ref xmlns:ali="http://www.niso.org/schemas/ali/1.0/">https://creativecommons.org/licenses/by-nc/4.0</ali:license_ref></license></permissions><self-uri xlink:href="https://journals.rudn.ru/miph/article/view/34463">https://journals.rudn.ru/miph/article/view/34463</self-uri><abstract xml:lang="en"><p style="text-align: justify;">This article proposes an algorithm for solving the problem of extracting information from biomedical patents and scientific publications. The introduced algorithm is based on machine learning methods. Experiments were carried out on patents from the USPTO database. Experiments have shown that the best extraction quality was achieved by a model based on BioBERT.</p></abstract><trans-abstract xml:lang="ru"><p style="text-align: justify;">В данной статье предложен алгоритм для решения задачи извлечения информации из биомедицинских патентов и научных публикаций. Представленный алгоритм основан на методах машинного обучения. Авторами были проведены эксперименты на патентах из базы USPTO. Эксперименты показали, что лучшее качество извлечения продемонстрировала модель, построенная на основе BioBERT.</p></trans-abstract><kwd-group xml:lang="en"><kwd>machine learning</kwd><kwd>natural language processing</kwd><kwd>named entity recognition</kwd><kwd>biomedical texts processing</kwd></kwd-group><kwd-group xml:lang="ru"><kwd>машинное обучение</kwd><kwd>обработка естественного языка</kwd><kwd>извлечение именованных сущностей</kwd><kwd>обработка биомедицинских текстов</kwd></kwd-group><funding-group/></article-meta></front><body></body><back><ref-list><ref id="B1"><label>1.</label><mixed-citation>S. A. Akhondi et al., “Automatic identification of relevant chemical compounds from patents,” Database: the journal of biological databases and curation, vol. 1, pp. 1-14, 2019. 
DOI: 10.1093/database/baz001.</mixed-citation></ref><ref id="B2"><label>2.</label><mixed-citation>D. Jessop, S. Adams, E. Willighagen, L. Hawizy, and P. Murray-Rust, “OSCAR4: A flexible architecture for chemical textmining,” Journal of cheminformatics, vol. 3, no. 1, pp. 1-12, 2011. DOI: 10.1186/1758-2946-3-41.</mixed-citation></ref><ref id="B3"><label>3.</label><mixed-citation>E. Soysal et al., “CLAMP - a toolkit for efficiently building customized clinical natural language processing pipelines,” Journal of the American Medical Informatics Association, vol. 25, no. 3, pp. 331-336, 2017. DOI: 10.1093/jamia/ocx132.</mixed-citation></ref><ref id="B4"><label>4.</label><mixed-citation>M. Swain and J. Cole, “ChemDataExtractor: a toolkit for automated extraction of chemical information from the scientific literature,” Journal of Chemical Information and Modeling, vol. 56, no. 10, pp. 1894-1904, 2016. DOI: 10.17863/CAM.10935.</mixed-citation></ref><ref id="B5"><label>5.</label><mixed-citation>J. Lee, W. Yoon, S. Kim, D. Kim, S. Kim, C. So, and J. Kang, “BioBERT: a pre-trained biomedical language representation model for biomedical text mining,” Bioinformatics (Oxford, England), vol. 36, no. 4, pp. 1234-1240, 2019. DOI: 10.1093/bioinformatics/btz682.</mixed-citation></ref><ref id="B6"><label>6.</label><mixed-citation>A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. Gomez, L. Kaiser, and I. Polosukhin, “Attention is all you need,” Advances in Neural Information Processing Systems, vol. 30, pp. 5998-6008, 2017.</mixed-citation></ref><ref id="B7"><label>7.</label><mixed-citation>J. Devlin, M.-W. Chang, K. Lee, and K. Toutanova, “BERT: pretraining of deep bidirectional transformers for language understanding,” Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, vol. 1, pp. 4171-4186, 2018. 
DOI: 10.18653/v1/N19-1423.</mixed-citation></ref><ref id="B8"><label>8.</label><mixed-citation>The OpenNLP Project, http://opennlp.apache.org, Accessed: 2023-03-07.</mixed-citation></ref><ref id="B9"><label>9.</label><mixed-citation>CRFsuite: a Fast Implementation of Conditional Random Fields (CRFs), http://www.chokkan.org/software/crfsuite/, Accessed: 2023-03-07.</mixed-citation></ref><ref id="B10"><label>10.</label><mixed-citation>J. M. Bernard, “Handling of Markush Structures,” Journal of chemical information and computer sciences, vol. 31, no. 1, pp. 64-68, 1991. DOI: 10.1021/ci00001a010.</mixed-citation></ref><ref id="B11"><label>11.</label><mixed-citation>S. Heller, A. McNaught, I. Pletnev, S. Stein, and D. Tchekhovskoi, “The IUPAC International Chemical Identifier,” Journal of Cheminformatics, vol. 7, pp. 1-34, 2015. DOI: 10.1186/s13321-015-0068-4.</mixed-citation></ref><ref id="B12"><label>12.</label><mixed-citation>USPTO, https://www.uspto.gov/patents, Accessed: 2023-03-07.</mixed-citation></ref><ref id="B13"><label>13.</label><mixed-citation>T. Mikolov, G. Corrado, K. Chen, and J. Dean, “Efficient estimation of word representations in vector space,” Proceedings of Workshop at ICLR, pp. 1-12, 2013.</mixed-citation></ref><ref id="B14"><label>14.</label><mixed-citation>T. Mikolov, W.-T. Yih, and G. Zweig, “Linguistic regularities in continuous space word representations,” Proceedings of NAACL-HLT, pp. 746-751, 2013.</mixed-citation></ref><ref id="B15"><label>15.</label><mixed-citation>C. Cortes and V. Vapnik, “Support-vector networks,” Machine Learning, vol. 20, no. 03, pp. 273-297, 1995. DOI: 10.1007/BF00994018.</mixed-citation></ref><ref id="B16"><label>16.</label><mixed-citation>J. R. Finkel, T. Grenager, and C. Manning, “Incorporating non-local information into information extraction systems by Gibbs sampling,” Proceedings of the 43rd Annual Meeting of the Association for Computational Linguistics (ACL 2005), pp. 363-370, 2005. 
DOI: 10.3115/1219840.1219885.</mixed-citation></ref><ref id="B17"><label>17.</label><mixed-citation>T. M. Mitchell, Machine learning. McGraw-Hill New York, 1997, 432 pp.</mixed-citation></ref></ref-list></back></article>
