<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article>
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.2" xml:lang="en"><front><journal-meta><journal-id journal-id-type="publisher-id">Russian Journal of Linguistics</journal-id><journal-title-group><journal-title xml:lang="en">Russian Journal of Linguistics</journal-title><trans-title-group xml:lang="ru"><trans-title>Russian Journal of Linguistics</trans-title></trans-title-group></journal-title-group><issn publication-format="print">2687-0088</issn><issn publication-format="electronic">2686-8024</issn><publisher><publisher-name xml:lang="en">Peoples’ Friendship University of Russia named after Patrice Lumumba (RUDN University)</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">31332</article-id><article-id pub-id-type="doi">10.22363/2687-0088-30132</article-id><article-categories><subj-group subj-group-type="toc-heading" xml:lang="en"><subject>Articles</subject></subj-group><subj-group subj-group-type="toc-heading" xml:lang="ru"><subject>Статьи</subject></subj-group><subj-group subj-group-type="toc-heading" xml:lang="zh"><subject>Articles</subject></subj-group><subj-group subj-group-type="article-type"><subject>Research Article</subject></subj-group></article-categories><title-group><article-title xml:lang="en">Text complexity and linguistic features: Their correlation in English and Russian</article-title><trans-title-group xml:lang="ru"><trans-title>Сложность текста и лингвистические признаки: как они соотносятся в русском и английском языках</trans-title></trans-title-group></title-group><contrib-group><contrib contrib-type="author"><contrib-id contrib-id-type="orcid">https://orcid.org/0000-0003-4464-1355</contrib-id><name-alternatives><name xml:lang="en"><surname>Morozov</surname><given-names>Dmitry 
A.</given-names></name><name xml:lang="ru"><surname>Морозов</surname><given-names>Дмитрий Алексеевич</given-names></name></name-alternatives><bio xml:lang="en"><p>Junior Researcher at the Laboratory of Applied Digital Technologies, International Mathematical Center</p></bio><bio xml:lang="ru"><p>младший научный сотрудник Лаборатории прикладных цифровых технологий Международного математического центра</p></bio><email>morozowdm@gmail.com</email><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><contrib-id contrib-id-type="orcid">https://orcid.org/0000-0001-8409-6457</contrib-id><name-alternatives><name xml:lang="en"><surname>Glazkova</surname><given-names>Anna V.</given-names></name><name xml:lang="ru"><surname>Глазкова</surname><given-names>Анна Валерьевна</given-names></name></name-alternatives><bio xml:lang="en"><p>Doctor of Sc. (Technology), Associate Professor of the Department of Software at the Institute of Mathematics and Computer Science</p></bio><bio xml:lang="ru"><p>кандидат технических наук, доцент кафедры программного обеспечения Института математики и компьютерных наук</p></bio><email>a.v.glazkova@utmn.ru</email><xref ref-type="aff" rid="aff2"/></contrib><contrib contrib-type="author"><contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-1767-5480</contrib-id><name-alternatives><name xml:lang="en"><surname>Iomdin</surname><given-names>Boris L.</given-names></name><name xml:lang="ru"><surname>Иомдин</surname><given-names>Борис Леонидович</given-names></name></name-alternatives><bio xml:lang="en"><p>holds a Ph.D. 
in Philology and is a Leading Researcher</p></bio><bio xml:lang="ru"><p>кандидат филологических наук, ведущий научный сотрудник</p></bio><email>iomdin@ruslang.ru</email><xref ref-type="aff" rid="aff3"/></contrib></contrib-group><aff-alternatives id="aff1"><aff><institution xml:lang="en">Novosibirsk State University</institution></aff><aff><institution xml:lang="ru">Новосибирский государственный университет</institution></aff></aff-alternatives><aff-alternatives id="aff2"><aff><institution xml:lang="en">University of Tyumen</institution></aff><aff><institution xml:lang="ru">Тюменский государственный университет</institution></aff></aff-alternatives><aff-alternatives id="aff3"><aff><institution xml:lang="en">Vinogradov Russian Language Institute</institution></aff><aff><institution xml:lang="ru">Институт русского языка им. В. В. Виноградова РАН</institution></aff></aff-alternatives><pub-date date-type="pub" iso-8601-date="2022-06-29" publication-format="electronic"><day>29</day><month>06</month><year>2022</year></pub-date><volume>26</volume><issue>2</issue><issue-title xml:lang="en">Computational Linguistics and Discourse Complexology</issue-title><issue-title xml:lang="ru">Компьютерная лингвистика и дискурсивная комплексология</issue-title><fpage>426</fpage><lpage>448</lpage><history><date date-type="received" iso-8601-date="2022-06-29"><day>29</day><month>06</month><year>2022</year></date></history><permissions><copyright-statement xml:lang="en">Copyright © 2022, Morozov D.A., Glazkova A.V., Iomdin B.L.</copyright-statement><copyright-statement xml:lang="ru">Copyright © 2022, Морозов Д.А., Глазкова А.В., Иомдин Б.Л.</copyright-statement><copyright-statement xml:lang="zh">Copyright © 2022, Morozov D., Glazkova A., Iomdin B.</copyright-statement><copyright-year>2022</copyright-year><copyright-holder xml:lang="en">Morozov D.A., Glazkova A.V., Iomdin B.L.</copyright-holder><copyright-holder xml:lang="ru">Морозов Д.А., Глазкова А.В., Иомдин 
Б.Л.</copyright-holder><copyright-holder xml:lang="zh">Morozov D., Glazkova A., Iomdin B.</copyright-holder><ali:free_to_read xmlns:ali="http://www.niso.org/schemas/ali/1.0/"/><license><ali:license_ref xmlns:ali="http://www.niso.org/schemas/ali/1.0/">https://creativecommons.org/licenses/by-nc/4.0</ali:license_ref></license></permissions><self-uri xlink:href="https://journals.rudn.ru/linguistics/article/view/31332">https://journals.rudn.ru/linguistics/article/view/31332</self-uri><abstract xml:lang="en"><p style="text-align: justify;">Text complexity assessment is a challenging task requiring various linguistic aspects to be taken into consideration. The complexity level of the text should correspond to the reader’s competence. A too complicated text could be incomprehensible, whereas a too simple one could be boring. For many years, simple features were used to assess readability, e.g. average length of words and sentences or vocabulary variety. Thanks to the development of natural language processing methods, the set of text parameters used for evaluating readability has expanded significantly. In recent years, many articles have been published the authors of which investigated the contribution of various lexical, morphological, and syntactic features to the readability level. Nevertheless, as the methods and corpora are quite diverse, it may be hard to draw general conclusions as to the effectiveness of linguistic information for evaluating text complexity. Moreover, a cross-lingual impact of different features on various datasets has not been investigated. The purpose of this study is to conduct a large-scale comparison of features of different nature. 
We experimentally assessed seven commonly used feature types (readability, traditional features, morphological features, punctuation, syntax, frequency, and topic modeling) on six corpora for text complexity assessment in English and Russian employing four common machine learning models: logistic regression, random forest, convolutional neural network and feedforward neural network. One of the corpora, the corpus of fiction literature read by Russian school students, was constructed for the experiment using a large-scale survey to ensure the objectivity of the labeling. We showed which feature types can significantly improve the performance and analyzed their impact according to the dataset characteristics, language, and data source.</p></abstract><trans-abstract xml:lang="ru"><p style="text-align: justify;">Автоматическая оценка читабельности текста - актуальная и непростая задача, которая требует учёта разнообразных лингвистических факторов. Сложность текста должна соответствовать уровню читателя: слишком сложный текст останется непонятым, слишком простой будет скучным. Исторически для оценки читабельности использовались простые характеристики: средняя длина слов и предложений, разнообразие лексики. Благодаря развитию методов обработки естественного языка набор используемых для оценки параметров текста существенно расширился. За последние годы было опубликовано множество работ, в которых исследовался вклад в сложность текста различных лексических, морфологических, синтаксических признаков. Тем не менее, поскольку использованные методы и корпусы довольно разнообразны, затруднительно делать общие выводы об эффективности различных лингвистических характеристик текста. Более того, не было проведено сравнение влияния признаков для различных языков. Целью настоящего исследования является проведение масштабного сравнения признаков различного характера. 
Мы экспериментально сравнили семь часто используемых типов признаков (индексы читабельности, традиционные, морфологические, синтаксические, пунктуационные, частотные признаки и тематическое моделирование) на материале трёх русскоязычных и трёх англоязычных корпусов, с использованием четырех распространённых алгоритмов машинного обучения: логистической регрессии, случайного леса, свёрточной нейронной сети и нейронной сети с прямой связью. Один из корпусов - корпус художественной литературы, читаемой российскими школьниками, - был создан для этого эксперимента с помощью масштабного опроса для обеспечения объективности разметки. Мы показали, какие типы признаков могут значительно повысить качество прогнозирования, и проанализировали их влияние в зависимости от характеристик корпуса, его языка и источника текстов.</p></trans-abstract><kwd-group xml:lang="en"><kwd>text complexity</kwd><kwd>machine learning</kwd><kwd>neural network</kwd><kwd>corpus linguistics</kwd></kwd-group><kwd-group xml:lang="ru"><kwd>сложность текста</kwd><kwd>машинное обучение</kwd><kwd>нейронные сети</kwd><kwd>корпусная лингвистика</kwd></kwd-group><funding-group><funding-statement xml:lang="en">The article was funded by RFBR, project number 19-29-14224.</funding-statement></funding-group></article-meta></front><body></body><back><ref-list><ref id="B1"><label>1.</label><mixed-citation>Blei, David M., Andrew Y. Ng &amp; Michael I. Jordan. 2003. Latent dirichlet allocation. The Journal of Machine Learning Research 3. 993-1022. https://doi.org/10.1016/B978-0-12-411519-4.00006-9</mixed-citation></ref><ref id="B2"><label>2.</label><mixed-citation>Burtsev, Mikhail, Alexander Seliverstov, Rafael Airapetyan, Mikhail Arkhipov, Dilyara Baymurzina, Nickolay Bushkov, Olga Gureenkova, Taras Khakhulin, Yuri Kuratov, Denis Kuznetsov, Alexey Litinsky, Varvara Logacheva, Alexey Lymar, Valentin Malykh, Maxim Petrov, Vadim Polulyakh, Leonid Pugachev, Alexey Sorokin, Maria Vikhreva &amp; Marat Zaynutdinov. 2018. 
DeepPavlov: Open-source library for dialogue systems. In Proceedings of ACL 2018, System Demonstrations. 122-127. https://doi.org/10.18653/v1/P18-4021</mixed-citation></ref><ref id="B3"><label>3.</label><mixed-citation>Cantos, Pascual &amp; Ángela Almela. 2019. Readability indices for the assessment of textbooks: A feasibility study in the context of EFL. Vigo International Journal of Applied Linguistics 16. 31-52. https://doi.org/10.35869/VIAL.V0I16.92</mixed-citation></ref><ref id="B4"><label>4.</label><mixed-citation>Chollet, Francois. 2015. Keras. Github. https://github.com/fchollet/keras (accessed 31.01.2022).</mixed-citation></ref><ref id="B5"><label>5.</label><mixed-citation>Coleman, Meri &amp; Ta Lin Liau. 1975. A computer readability formula designed for machine scoring. Journal of Applied Psychology 60(2). 283.</mixed-citation></ref><ref id="B6"><label>6.</label><mixed-citation>Dale, Edgar &amp; Jeanne S. Chall. 1948. A formula for predicting readability: Instructions. Educational Research Bulletin 27. 11-20, 37-54.</mixed-citation></ref><ref id="B7"><label>7.</label><mixed-citation>Devlin, Jacob, Ming-Wei Chang, Kenton Lee &amp; Kristina Toutanova. 2019. BERT: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 4171-4186. Minneapolis, Minnesota. Association for Computational Linguistics. https://doi.org/10.18653/v1/N19-1423</mixed-citation></ref><ref id="B8"><label>8.</label><mixed-citation>Deutsch, Tovly, Masoud Jasbi &amp; Stuart Shieber. 2020. Linguistic Features for Readability Assessment. In Proceedings of the Fifteenth Workshop on Innovative Use of NLP for Building Educational Applications. Association for Computational Linguistics. 1-17. 
https://doi.org/10.18653/v1/2020.bea-1.1</mixed-citation></ref><ref id="B9"><label>9.</label><mixed-citation>Feng, Lijun, Martin Jansche, Matt Huenerfauth &amp; Noémie Elhadad. 2010. A comparison of features for automatic readability assessment. In Coling 2010: Posters. 276-284.</mixed-citation></ref><ref id="B10"><label>10.</label><mixed-citation>Glazkova, Anna, Yury Egorov &amp; Maksim Glazkov. 2021. A comparative study of feature types for age-based text classification. Analysis of Images, Social Networks and Texts. 120-134. Cham. Springer International Publishing. https://doi.org/10.1007/978-3-030-72610-2_9</mixed-citation></ref><ref id="B11"><label>11.</label><mixed-citation>Honnibal, Matthew &amp; Ines Montani. 2017. spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing. To appear.</mixed-citation></ref><ref id="B12"><label>12.</label><mixed-citation>Iomdin, Boris L. &amp; Dmitry A. Morozov. 2021. Who Can Understand “Dunno”? Automatic Assessment of Text Complexity in Children’s Literature. Russian Speech 5. 55-68. https://doi.org/10.31857/S013161170017239-1</mixed-citation></ref><ref id="B13"><label>13.</label><mixed-citation>Isaeva, Ulyana &amp; Alexey Sorokin. 2020. Investigating the robustness of reading difficulty models for Russian educational texts. In AIST 2020: Recent Trends in Analysis of Images, Social Networks and Texts. 65-77. https://doi.org/10.1007/978-3-030-71214-3_6</mixed-citation></ref><ref id="B14"><label>14.</label><mixed-citation>Ivanov, Vladimir, Marina Solnyshkina &amp; Valery Solovyev. 2018. Efficiency of text readability features in Russian academic texts. In Komp'yuternaya Lingvistika I Intellektual'nye Tehnologii. 284-293.</mixed-citation></ref><ref id="B15"><label>15.</label><mixed-citation>Kincaid, J. Peter, Robert P. Fishburne Jr., Richard L. Rogers &amp; Brad S. Chissom. 1975. 
Derivation of New Readability Formulas (Automated Readability Index, Fog Count and Flesch Reading Ease Formula) for Navy Enlisted Personnel. Naval Technical Training Command Millington TN Research Branch. https://doi.org/10.21236/ada006655</mixed-citation></ref><ref id="B16"><label>16.</label><mixed-citation>Kingma, Diederik P. &amp; Jimmy Ba. 2015. Adam: A method for stochastic optimization. ICLR.</mixed-citation></ref><ref id="B17"><label>17.</label><mixed-citation>Korobov, Mikhail. 2015. Morphological analyzer and generator for Russian and Ukrainian languages. In International Conference on Analysis of Images, Social Networks and Texts. 320-332. Springer. https://doi.org/10.1007/978-3-319-26123-2_31</mixed-citation></ref><ref id="B18"><label>18.</label><mixed-citation>Kuratov, Yuri &amp; Mikhail Arkhipov. 2019. Adaptation of deep bidirectional multilingual transformers for Russian language. Komp’yuternaya Lingvistika i Intellektual’nye Tehnologii. 333-339.</mixed-citation></ref><ref id="B19"><label>19.</label><mixed-citation>Kutuzov, Andrey &amp; Elizaveta Kuzmenko. 2016. Web-vectors: A toolkit for building web interfaces for vector semantic models. In International Conference on Analysis of Images, Social Networks and Texts. 155-161. Springer. https://doi.org/10.1007/978-3-319-52920-2_15</mixed-citation></ref><ref id="B20"><label>20.</label><mixed-citation>Leech, Geoffrey, Paul Rayson &amp; Andrew Wilson. 2001. Word Frequencies in Written and Spoken English: Based on the British National Corpus. Routledge.</mixed-citation></ref><ref id="B21"><label>21.</label><mixed-citation>Loper, Edward &amp; Steven Bird. 2002. NLTK: The natural language toolkit. In Proceedings of the ACL-02 Workshop on Effective Tools and Methodologies for Teaching Natural Language Processing and Computational Linguistics. 63-70.</mixed-citation></ref><ref id="B22"><label>22.</label><mixed-citation>Loshchilov, Ilya &amp; Frank Hutter. 2018. Decoupled weight decay regularization. 
In International Conference on Learning Representations.</mixed-citation></ref><ref id="B23"><label>23.</label><mixed-citation>Lyashevskaya, Olga &amp; Serge Sharoff. 2009. The Frequency Dictionary of the Modern Russian Language (Based on the Materials of the Russian National Corpus). Moscow: Azbukovnik.</mixed-citation></ref><ref id="B24"><label>24.</label><mixed-citation>Martinc, Matej, Senja Pollak &amp; Marko Robnik-Sikonja. 2021. Supervised and unsupervised neural approaches to text readability. Computational Linguistics 47. 1-39. https://doi.org/10.1162/coli_a_00398</mixed-citation></ref><ref id="B25"><label>25.</label><mixed-citation>McLaughlin, G. Harry. 1969. Smog grading - a new readability formula. Journal of reading 12(8). 639-646.</mixed-citation></ref><ref id="B26"><label>26.</label><mixed-citation>Mikolov, Tomas, Edouard Grave, Piotr Bojanowski, Christian Puhrsch &amp; Armand Joulin. 2018. Advances in pre-training distributed word representations. In Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018).</mixed-citation></ref><ref id="B27"><label>27.</label><mixed-citation>Pedregosa, Fabian, Gael Varoquaux, Alexandre Gramfort, Vincent Michel, Bertrand Thirion, Olivier Grisel, Mathieu Blondel, Peter Prettenhofer, Ron Weiss, Vincent Dubourg, Jake Vanderplas, Alexandre Passos, David Cournapeau, Matthieu Brucher, Matthieu Perrot &amp; Edouard Duchesnay. 2011. Scikit-learn: Machine learning in Python. The Journal of Machine Learning Research 12. 2825-2830.</mixed-citation></ref><ref id="B28"><label>28.</label><mixed-citation>Rehurek, Radim &amp; Petr Sojka. 2010. Software framework for topic modelling with large corpora. In Proceedings of the LREC 2010 Workshop on New Challenges for NLP Frameworks.</mixed-citation></ref><ref id="B29"><label>29.</label><mixed-citation>Reimers, Nils &amp; Iryna Gurevych. 2019. Sentence-bert: Sentence embeddings using siamese bert-networks. 
In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing. 3982-3992. Association for Computational Linguistics. https://doi.org/10.18653/v1/D19-1410</mixed-citation></ref><ref id="B30"><label>30.</label><mixed-citation>Reimers, Nils &amp; Iryna Gurevych. 2020. Making monolingual sentence embeddings multilingual using knowledge distillation. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing. 4512-4525. Association for Computational Linguistics. https://doi.org/10.18653/v1/2020.emnlp-main.365</mixed-citation></ref><ref id="B31"><label>31.</label><mixed-citation>Senter, R. J. &amp; E. A. Smith. 1967. Automated readability index. AMRL-TR. Aerospace Medical Research Laboratories. 1-14.</mixed-citation></ref><ref id="B32"><label>32.</label><mixed-citation>Solnyshkina, Marina, Vladimir Ivanov &amp; Valery Solovyev. 2018. Readability formula for Russian texts: A modified version. In Mexican International Conference on Artificial Intelligence. 132-145. Springer. https://doi.org/10.1007/978-3-030-04497-8_11</mixed-citation></ref><ref id="B33"><label>33.</label><mixed-citation>Templin, Mildred C. 1957. Certain Language Skills in Children; Their Development and Interrelationships. Minneapolis: University of Minnesota Press.</mixed-citation></ref><ref id="B34"><label>34.</label><mixed-citation>Vajjala, Sowmya &amp; Ivana Lucic. 2018. OneStopEnglish corpus: A new corpus for automatic readability assessment and text simplification. In Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications. 297-304. Association for Computational Linguistics. 
https://doi.org/10.18653/v1/W18-0535</mixed-citation></ref><ref id="B35"><label>35.</label><mixed-citation>Wolf, Thomas, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, Remi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest &amp; Alexander Rush. 2020. Transformers: State-of-the-art natural language processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. 38-45. https://doi.org/10.18653/v1/2020.emnlp-demos.6</mixed-citation></ref><ref id="B36"><label>36.</label><mixed-citation>Xun, Guangxu, Vishrawas Gopalakrishnan, Fenglong Ma, Yaliang Li, Jing Gao &amp; Aidong Zhang. 2016. Topic discovery for short texts using word embeddings. In 2016 IEEE 16th International Conference on Data Mining (ICDM). 1299-1304. IEEE.</mixed-citation></ref><ref id="B37"><label>37.</label><mixed-citation>Yan, Xiaohui, Jiafeng Guo, Yanyan Lan &amp; Xueqi Cheng. 2013. A biterm topic model for short texts. In Proceedings of the 22nd International Conference on World Wide Web. 1445-1456. https://doi.org/10.1145/2488388.2488514</mixed-citation></ref><ref id="B38"><label>38.</label><mixed-citation>Chapter 699a. Readable language in insurance policies. URL: https://www.cga.ct.gov/current/pub/chap_699a.htm#sec_38a-29 (accessed 29.05.2022).</mixed-citation></ref><ref id="B39"><label>39.</label><mixed-citation>Readability. 2021. URL: https://github.com/morozowdmitry/readability (accessed 29.05.2022).</mixed-citation></ref><ref id="B40"><label>40.</label><mixed-citation>Readability 0.3.1. 2019. URL: https://pypi.org/project/readability/ (accessed 29.05.2022).</mixed-citation></ref></ref-list></back></article>
