% Journal article record: "MedFusion", Journal of Computer Science 22(5), 2026.
% Conventions used in this entry:
%   - month is the standard unquoted macro so the style can format/localise it;
%   - the page range uses a double hyphen (typeset as an en dash);
%   - braces protect the camel-case name in the title from style recasing;
%   - LaTeX specials in the abstract are escaped so it can be typeset safely.
@article{10.3844/jcssp.2026.1539.1551,
  author       = {Mahapatra, Satyajit and Mishra, Jibitesh and Patra, Kumar Janardan and Dash, Sanjit Kumar and Deferisha, Aliazar Deneke},
  title        = {{MedFusion}: A Unified Multimodal Framework for Visual Question Answering and Explainable Medical Recommendation},
  journal      = {Journal of Computer Science},
  volume       = {22},
  number       = {5},
  pages        = {1539--1551},
  year         = {2026},
  month        = may,
  publisher    = {Science Publications},
  doi          = {10.3844/jcssp.2026.1539.1551},
  url          = {https://thescipub.com/abstract/jcssp.2026.1539.1551},
  abstract     = {In clinical decision-making, the ability to ask visual questions about medical images and receive accurate, personalized, and interpretable recommendations can significantly enhance practitioner support systems. This paper presents MedFusion, a unified multimodal framework that integrates Visual Question Answering (VQA), personalized medical recommendation, and explainability within a single architecture. The proposed model employs co-attention--based visual--textual fusion augmented with retrieval-enhanced reasoning to improve answer grounding, while personalized recommendations are generated using a shared multimodal representation supported by GAN-guided feature augmentation. To enhance transparency, the framework provides attention-based heatmaps and natural-language rationales for both answers and recommendations. Extensive experiments on VQA-RAD, EHRXQA, and Med-RecX demonstrate that MedFusion outperforms state-of-the-art medical VQA and recommendation baselines, achieving a 7.4\% improvement in VQA accuracy, reducing RMSE to 0.91, and improving human-rated interpretability to 4.5/5. Ablation studies confirm the effectiveness of retrieval augmentation, GAN-guided enhancement, and joint multi-task learning. These results indicate that MedFusion offers a robust and explainable decision-support solution, advancing the deployment of trustworthy, user-adaptive AI systems in real-world healthcare environments.},
  article_type = {journal},
}