% Journal article: MedFusion (Journal of Computer Science, vol. 22, no. 5, 2026).
% The citation key is the article's DOI. The article_type field is not a
% standard BibTeX field and is kept only as an annotation.
@article{10.3844/jcssp.2026.1539.1551,
  author       = {Mahapatra, Satyajit and Mishra, Jibitesh and Patra, Kumar Janardan and Dash, Sanjit Kumar and Deferisha, Aliazar Deneke},
  title        = {{MedFusion}: A Unified Multimodal Framework for Visual Question Answering and Explainable Medical Recommendation},
  journal      = {Journal of Computer Science},
  publisher    = {Science Publications},
  volume       = {22},
  number       = {5},
  pages        = {1539--1551},
  year         = {2026},
  month        = may,
  doi          = {10.3844/jcssp.2026.1539.1551},
  url          = {https://thescipub.com/abstract/jcssp.2026.1539.1551},
  article_type = {journal},
  abstract     = {In clinical decision-making, the ability to ask visual questions about medical images and receive accurate, personalized, and interpretable recommendations can significantly enhance practitioner support systems. This paper presents MedFusion, a unified multimodal framework that integrates Visual Question Answering (VQA), personalized medical recommendation, and explainability within a single architecture. The proposed model employs co-attention--based visual--textual fusion augmented with retrieval-enhanced reasoning to improve answer grounding, while personalized recommendations are generated using a shared multimodal representation supported by GAN-guided feature augmentation. To enhance transparency, the framework provides attention-based heatmaps and natural-language rationales for both answers and recommendations. Extensive experiments on VQA-RAD, EHRXQA, and Med-RecX demonstrate that MedFusion outperforms state-of-the-art medical VQA and recommendation baselines, achieving a 7.4\% improvement in VQA accuracy, reducing RMSE to 0.91, and improving human-rated interpretability to 4.5/5. Ablation studies confirm the effectiveness of retrieval augmentation, GAN-guided enhancement, and joint multi-task learning. These results indicate that MedFusion offers a robust and explainable decision-support solution, advancing the deployment of trustworthy, user-adaptive AI systems in real-world healthcare environments.},
}