Publications
2024
- ACL — DIALECTBENCH: An NLP Benchmark for Dialects, Varieties, and Closely-Related Languages. Faisal, Fahim, Ahia, Orevaoghene, Srivastava, Aarohi, Ahuja, Kabir, Chiang, David, Tsvetkov, Yulia, and Anastasopoulos, Antonios. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Aug 2024
Language technologies should be judged on their usefulness in real-world use cases. An often overlooked aspect in natural language processing (NLP) research and evaluation is language variation in the form of non-standard dialects or language varieties (hereafter, varieties). Most NLP benchmarks are limited to standard language varieties. To fill this gap, we propose DIALECTBENCH, the first-ever large-scale benchmark for NLP on varieties, which aggregates an extensive set of task-varied varieties datasets (10 text-level tasks covering 281 varieties). This allows for a comprehensive evaluation of NLP system performance on different varieties. We provide substantial proof of performance disparities between standard and non-standard language varieties, and we also identify language clusters with larger performance divergence across tasks. We believe DIALECTBENCH provides a comprehensive view of the current state of NLP for varieties and one step towards advancing it further.
@inproceedings{faisal-etal-2024-dialectbench,
  title     = {{DIALECTBENCH}: An {NLP} Benchmark for Dialects, Varieties, and Closely-Related Languages},
  author    = {Faisal, Fahim and Ahia, Orevaoghene and Srivastava, Aarohi and Ahuja, Kabir and Chiang, David and Tsvetkov, Yulia and Anastasopoulos, Antonios},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.acl-long.777},
  doi       = {10.18653/v1/2024.acl-long.777},
  pages     = {14412--14454},
}
- arXiv — Data-Augmentation-Based Dialectal Adaptation for LLMs. Faisal, Fahim, and Anastasopoulos, Antonios. In Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024), Jun 2024
This report presents gmnlp’s participation to the Dialect-Copa shared task at VarDial 2024 (Chifu et al., 2024), which focuses on evaluating the commonsense reasoning capabilities of large language models (LLMs) on South Slavic micro-dialects. The task aims to assess how well LLMs can handle non-standard dialectal varieties, as their performance on standard languages is already well-established. We propose an approach that combines the strengths of different types of language models and leverages data augmentation techniques to improve task performance on three South Slavic dialects: Chakavian, Cherkano, and Torlak. We conduct experiments using a language-family-focused encoder-based model (BERTić) and a domain-agnostic multilingual model (AYA-101). Our results demonstrate that the proposed data augmentation techniques lead to substantial performance gains across all three test datasets in the open-source model category. This work highlights the practical utility of data augmentation and the potential of LLMs in handling non-standard dialectal varieties, contributing to the broader goal of advancing natural language understanding in low-resource and dialectal settings.
@inproceedings{faisal-anastasopoulos-2024-data,
  title     = {Data-Augmentation-Based Dialectal Adaptation for {LLM}s},
  author    = {Faisal, Fahim and Anastasopoulos, Antonios},
  editor    = {Scherrer, Yves and Jauhiainen, Tommi and Ljube{\v{s}}i{\'c}, Nikola and Zampieri, Marcos and Nakov, Preslav and Tiedemann, J{\"o}rg},
  booktitle = {Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024)},
  month     = jun,
  year      = {2024},
  address   = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.vardial-1.17},
  doi       = {10.18653/v1/2024.vardial-1.17},
  pages     = {197--208},
}
- MRL — An Efficient Approach for Studying Cross-Lingual Transfer in Multilingual Language Models. Faisal, Fahim, and Anastasopoulos, Antonios. In Proceedings of the Fourth Workshop on Multilingual Representation Learning (MRL 2024), Nov 2024
The capacity and effectiveness of pre-trained multilingual models (MLMs) for zero-shot cross-lingual transfer is well established. However, phenomena of positive or negative transfer, and the effect of language choice still need to be fully understood, especially in the complex setting of massively multilingual LMs. We propose an \textit{efficient} method to study transfer language influence in zero-shot performance on another target language. Unlike previous work, our approach \textit{disentangles} downstream tasks from language, using dedicated adapter units. Our findings suggest that some languages do not largely affect others, while some languages, especially ones unseen during pre-training, can be extremely beneficial or detrimental for different target languages. We find that no transfer language is beneficial for all target languages. We do, curiously, observe languages previously unseen by MLMs consistently benefit from transfer from \textit{almost} any language. We additionally use our modular approach to quantify negative interference efficiently and categorize languages accordingly. Furthermore, we provide a list of promising transfer-target language configurations that consistently lead to target language performance improvements.
@inproceedings{faisal-anastasopoulos-2024-efficient,
  title     = {An Efficient Approach for Studying Cross-Lingual Transfer in Multilingual Language Models},
  author    = {Faisal, Fahim and Anastasopoulos, Antonios},
  editor    = {S{\"a}lev{\"a}, Jonne and Owodunni, Abraham},
  booktitle = {Proceedings of the Fourth Workshop on Multilingual Representation Learning (MRL 2024)},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.mrl-1.4},
  pages     = {45--92},
}
2023
- SemEval — GMNLP at SemEval-2023 Task 12: Sentiment Analysis with Phylogeny-Based Adapters. Alam, Md Mahfuz, Xie, Ruoyu, Faisal, Fahim, and Anastasopoulos, Antonios. In Proceedings of the 17th International Workshop on Semantic Evaluation (SemEval-2023), Jul 2023
@inproceedings{alam-etal-23-semeval,
  title     = {{GMNLP} at {SemEval}-2023 Task 12: Sentiment Analysis with Phylogeny-Based Adapters},
  author    = {ibn Alam, Md Mahfuz and Xie, Ruoyu and Faisal, Fahim and Anastasopoulos, Antonios},
  booktitle = {Proceedings of the 17th International Workshop on Semantic Evaluation (SemEval-2023)},
  month     = jul,
  year      = {2023},
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
}
- MRL — To token or not to token: A Comparative Study of Text Representations for Cross-Lingual Transfer. Rahman, Md Mushfiqur, Sakib, Fardin Ahsan, Faisal, Fahim, and Anastasopoulos, Antonios. In Proceedings of the 3rd Workshop on Multi-lingual Representation Learning (MRL), Dec 2023
@inproceedings{rahman-etal-2023-token,
  title     = {To token or not to token: A Comparative Study of Text Representations for Cross-Lingual Transfer},
  author    = {Rahman, Md Mushfiqur and Sakib, Fardin Ahsan and Faisal, Fahim and Anastasopoulos, Antonios},
  editor    = {Ataman, Duygu},
  booktitle = {Proceedings of the 3rd Workshop on Multi-lingual Representation Learning (MRL)},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.mrl-1.6},
  doi       = {10.18653/v1/2023.mrl-1.6},
  pages     = {67--84},
}
- MRL — Geographic and Geopolitical Biases of Language Models. Faisal, Fahim, and Anastasopoulos, Antonios. In Proceedings of the 3rd Workshop on Multi-lingual Representation Learning (MRL), Dec 2023
@inproceedings{faisal-anastasopoulos-2023-geographic,
  title     = {Geographic and Geopolitical Biases of Language Models},
  author    = {Faisal, Fahim and Anastasopoulos, Antonios},
  editor    = {Ataman, Duygu},
  booktitle = {Proceedings of the 3rd Workshop on Multi-lingual Representation Learning (MRL)},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.mrl-1.12},
  doi       = {10.18653/v1/2023.mrl-1.12},
  pages     = {139--163},
}
2022
- ACL — Dataset Geography: Mapping Language Data to Language Users. Faisal, Fahim, Wang, Yinkai, and Anastasopoulos, Antonios. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), May 2022
As language technologies become more ubiquitous, there are increasing efforts towards expanding the language diversity and coverage of natural language processing (NLP) systems. Arguably, the most important factor influencing the quality of modern NLP systems is data availability. In this work, we study the geographical representativeness of NLP datasets, aiming to quantify if and by how much do NLP datasets match the expected needs of the language speakers. In doing so, we use entity recognition and linking systems, also making important observations about their cross-lingual consistency and giving suggestions for more robust evaluation. Last, we explore some geographical and economic factors that may explain the observed dataset distributions.
@inproceedings{faisal-etal-2022-dataset,
  title     = {Dataset Geography: Mapping Language Data to Language Users},
  author    = {Faisal, Fahim and Wang, Yinkai and Anastasopoulos, Antonios},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.acl-long.239},
  doi       = {10.18653/v1/2022.acl-long.239},
  pages     = {3381--3411},
}
- AACL — Phylogeny-Inspired Adaptation of Multilingual Models to New Languages. Faisal, Fahim, and Anastasopoulos, Antonios. Accepted for publication in AACL 2022, May 2022
@misc{phylogeny,
  author    = {Faisal, Fahim and Anastasopoulos, Antonios},
  title     = {Phylogeny-Inspired Adaptation of Multilingual Models to New Languages},
  year      = {2022},
  publisher = {arXiv},
  note      = {Accepted for publication in AACL 2022},
  keywords  = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  copyright = {Creative Commons Attribution 4.0 International},
}
2021
- NLP4Prog — Code to Comment Translation: A Comparative Study on Model Effectiveness & Errors. Mahmud, Junayed, Faisal, Fahim, Arnob, Raihan Islam, Anastasopoulos, Antonios, and Moran, Kevin. In Proceedings of the 1st Workshop on Natural Language Processing for Programming (NLP4Prog 2021), Aug 2021
Automated source code summarization is a popular software engineering research topic wherein machine translation models are employed to “translate” code snippets into relevant natural language descriptions. Most evaluations of such models are conducted using automatic reference-based metrics. However, given the relatively large semantic gap between programming languages and natural language, we argue that this line of research would benefit from a qualitative investigation into the various error modes of current state-of-the-art models. Therefore, in this work, we perform both a quantitative and qualitative comparison of three recently proposed source code summarization models. In our quantitative evaluation, we compare the models based on the smoothed BLEU-4, METEOR, and ROUGE-L machine translation metrics, and in our qualitative evaluation, we perform a manual open-coding of the most common errors committed by the models when compared to ground truth captions. Our investigation reveals new insights into the relationship between metric-based performance and model prediction errors grounded in an error taxonomy that can be used to drive future research efforts.
@inproceedings{mahmud-etal-2021-code,
  title     = {Code to Comment Translation: A Comparative Study on Model Effectiveness {\&} Errors},
  author    = {Mahmud, Junayed and Faisal, Fahim and Arnob, Raihan Islam and Anastasopoulos, Antonios and Moran, Kevin},
  booktitle = {Proceedings of the 1st Workshop on Natural Language Processing for Programming (NLP4Prog 2021)},
  month     = aug,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.nlp4prog-1.1},
  doi       = {10.18653/v1/2021.nlp4prog-1.1},
  pages     = {1--16},
}
- EMNLP — SD-QA: Spoken Dialectal Question Answering for the Real World. Faisal, Fahim, Keshava, Sharlina, Alam, Md Mahfuz Ibn, and Anastasopoulos, Antonios. In Findings of the Association for Computational Linguistics: EMNLP 2021, Nov 2021
Question answering (QA) systems are now available through numerous commercial applications for a wide variety of domains, serving millions of users that interact with them via speech interfaces. However, current benchmarks in QA research do not account for the errors that speech recognition models might introduce, nor do they consider the language variations (dialects) of the users. To address this gap, we augment an existing QA dataset to construct a multi-dialect, spoken QA benchmark on five languages (Arabic, Bengali, English, Kiswahili, Korean) with more than 68k audio prompts in 24 dialects from 255 speakers. We provide baseline results showcasing the real-world performance of QA systems and analyze the effect of language variety and other sensitive speaker attributes on downstream performance. Last, we study the fairness of the ASR and QA models with respect to the underlying user populations.
@inproceedings{faisal-etal-2021-sd-qa,
  title     = {{SD}-{QA}: Spoken Dialectal Question Answering for the Real World},
  author    = {Faisal, Fahim and Keshava, Sharlina and Alam, Md Mahfuz Ibn and Anastasopoulos, Antonios},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2021},
  month     = nov,
  year      = {2021},
  address   = {Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.findings-emnlp.281},
  pages     = {3296--3315},
}
- MRQA — Investigating Post-pretraining Representation Alignment for Cross-Lingual Question Answering. Faisal, Fahim, and Anastasopoulos, Antonios. In Proceedings of the 3rd Workshop on Machine Reading for Question Answering, Nov 2021
Human knowledge is collectively encoded in the roughly 6500 languages spoken around the world, but it is not distributed equally across languages. Hence, for information-seeking question answering (QA) systems to adequately serve speakers of all languages, they need to operate cross-lingually. In this work we investigate the capabilities of multilingually pretrained language models on cross-lingual QA. We find that explicitly aligning the representations across languages with a post-hoc finetuning step generally leads to improved performance. We additionally investigate the effect of data size as well as the language choice in this fine-tuning step, also releasing a dataset for evaluating cross-lingual QA systems.
@inproceedings{faisal-anastasopoulos-2021-investigating,
  title     = {Investigating Post-pretraining Representation Alignment for Cross-Lingual Question Answering},
  author    = {Faisal, Fahim and Anastasopoulos, Antonios},
  booktitle = {Proceedings of the 3rd Workshop on Machine Reading for Question Answering},
  month     = nov,
  year      = {2021},
  address   = {Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.mrqa-1.14},
  pages     = {133--148},
}
2020
- JOI — Mining Temporal Evolution of Knowledge Graphs and Genealogical Features for Literature-based Discovery Prediction. Choudhury, Nazim, Faisal, Fahim, and Khushi, Matloob. Journal of Informetrics, Nov 2020
Literature-based discovery process identifies the important but implicit relations among information embedded in published literature. Existing techniques from Information Retrieval (IR) and Natural Language Processing (NLP) attempt to identify the hidden or unpublished connections between information concepts within published literature, however, these techniques overlooked the concept of predicting the future and emerging relations among scientific knowledge components such as author selected keywords encapsulated within the literature. Keyword Co-occurrence Network (KCN), built upon author selected keywords, is considered as a knowledge graph that focuses both on these knowledge components and knowledge structure of a scientific domain by examining the relationships between knowledge entities. Using data from two multidisciplinary research domains other than the bio-medical domain, and capitalizing on bibliometrics, the dynamicity of temporal KCNs, and a recurrent neural network, this study develops some novel features supportive for the prediction of the future literature-based discoveries - the emerging connections (co-appearances in the same article) among keywords. Temporal importance extracted from both bipartite and unipartite networks, communities defined by genealogical relations, and the relative importance of temporal citation counts were used in the feature construction process. Both node and edge-level features were input into a recurrent neural network to forecast the feature values and predict the future relations between different scientific concepts/topics represented by the author selected keywords. High performance rates, compared both against contemporary heterogeneous network-based method and preferential attachment process, suggest that these features complement both the prediction of future literature-based discoveries and emerging trend analysis.
@article{CHOUDHURY2020101057,
  title     = {Mining Temporal Evolution of Knowledge Graphs and Genealogical Features for Literature-based Discovery Prediction},
  author    = {Choudhury, Nazim and Faisal, Fahim and Khushi, Matloob},
  journal   = {Journal of Informetrics},
  volume    = {14},
  number    = {3},
  pages     = {101057},
  year      = {2020},
  issn      = {1751-1577},
  doi       = {10.1016/j.joi.2020.101057},
  url       = {https://www.sciencedirect.com/science/article/pii/S1751157719304468},
  keywords  = {Literature-based Knowledge Discovery, Dynamic Supervised Link Prediction, Keyword Co-occurrence Network (KCN), Genealogical Community, Weighted Temporal Citation},
}
2019
- ICAEE — A Framework For Disease Identification From Unstructured Data Using Text Classification And Disease Knowledge Base. Faisal, Fahim, Bhuiyan, Shafkat Ahmed, Ashraf, Faisal Bin, and Kamal, Abu Raihan Mostofa. In 2019 5th International Conference on Advances in Electrical Engineering (ICAEE), Nov 2019
@inproceedings{8975447,
  title     = {A Framework For Disease Identification From Unstructured Data Using Text Classification And Disease Knowledge Base},
  author    = {Faisal, Fahim and Bhuiyan, Shafkat Ahmed and Ashraf, Faisal Bin and Kamal, Abu Raihan Mostofa},
  booktitle = {2019 5th International Conference on Advances in Electrical Engineering (ICAEE)},
  year      = {2019},
  pages     = {547--554},
  doi       = {10.1109/ICAEE48663.2019.8975447},
}