@article{10.3844/jcssp.2025.1037.1048,
  article_type = {journal},
  title = {Research on Deep Neural Network for Afaan-Oromo Language Text-to-Speech Synthesis},
  author = {Rundasa, Diriba Gichile and Ramu, Arulmurugan and Adugna, Teshale Debushe and Teshome, Chala Sembeta and Tasew, Desalegn},
  volume = {21},
  number = {5},
  year = {2025},
  month = {Apr},
  pages = {1037-1048},
  doi = {10.3844/jcssp.2025.1037.1048},
  url = {https://thescipub.com/abstract/jcssp.2025.1037.1048},
  abstract = {Text-to-speech synthesis is the automatic conversion of unrestricted natural-language text into spoken form that closely mimics how a native speaker of the language would utter the same text. The purpose of a text-to-speech synthesizer is to generate a comprehensible, natural-sounding human voice from text transcriptions. Despite the wide range of potential applications for text-to-speech systems, the field is language-dependent, with most efforts concentrated on well-resourced languages, especially English. The linguistic resources required to produce speech from text are lacking for under-resourced languages such as Afaan-Oromo. To develop an Afaan-Oromo text-to-speech synthesizer, a speech dataset of 10,644 text-audio pairs was assembled from dependable sources. The proposed model was then developed to handle nonstandard terminology, including acronyms, currencies, and numerals, in addition to common terms and names. A deep neural network was selected for this study because of its ability to map complex textual input to acoustic feature parameters. A number of experiments were carried out to find the best-performing model. Model performance was assessed objectively by counting attention errors and subjectively with the Mean Opinion Score (MOS) test. The objective evaluation revealed that Deep Voice 3 produced attention errors on 18 of the 248 words in the evaluation sentence set, whereas Tacotron-2 made only two attention errors. Moreover, the MOS scores for naturalness and intelligibility were 4.36 and 4.33 out of 5 for Tacotron-2, respectively, and 3.32 and 3.04 for Deep Voice 3, respectively. Therefore, the Tacotron-2 model yielded promising results compared with Deep Voice 3, making it suitable for a range of applications, such as smart education, telephone inquiry services, and recommendation systems.},
  journal = {Journal of Computer Science},
  publisher = {Science Publications}
}