2022
Chen, Shuguang; Aguilar, Gustavo; Srinivasan, Anirudh; Diab, Mona; Solorio, Thamar
CALCS 2021 Shared Task: Machine Translation for Code-Switched Data Journal Article
In: arXiv preprint arXiv:2202.09625, 2022.
Abstract | Links | BibTeX | Tags: Code-Switching
% arXiv preprint on the CALCS 2021 machine-translation shared task; the five authors are joined with "and" so each name is parsed separately.
@article{nokey,
  title         = {CALCS 2021 Shared Task: Machine Translation for Code-Switched Data},
  author        = {Shuguang Chen and Gustavo Aguilar and Anirudh Srinivasan and Mona Diab and Thamar Solorio},
  url           = {https://arxiv.org/pdf/2202.09625.pdf},
  year          = {2022},
  date          = {2022-02-19},
  urldate       = {2022-02-19},
  journal       = {arXiv preprint arXiv:2202.09625},
  eprint        = {2202.09625},
  archiveprefix = {arXiv},
  abstract      = {To date, efforts in the code-switching literature have focused for the most part on language identification, POS, NER, and syntactic parsing. In this paper, we address machine translation for code-switched social media data. We create a community shared task. We provide two modalities for participation: supervised and unsupervised. For the supervised setting, participants are challenged to translate English into Hindi-English (Eng-Hinglish) in a single direction. For the unsupervised setting, we provide the following language pairs: English and Spanish-English (Eng-Spanglish), and English and Modern Standard Arabic-Egyptian Arabic (Eng-MSAEA) in both directions. We share insights and challenges in curating the "into" code-switching language evaluation data. Further, we provide baselines for all language pairs in the shared task. The leaderboard for the shared task comprises 12 individual system submissions corresponding to 5 different teams. The best performance achieved is 12.67% BLEU score for English to Hinglish and 25.72% BLEU score for MSAEA to English.},
  keywords      = {Code-Switching},
  pubstate      = {published},
  tppubtype     = {article}
}
2021
Parikh, Dwija; Solorio, Thamar
Normalization and Back-Transliteration for Code-Switched Data Inproceedings
In: Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching, pp. 119–124, ACL, 2021.
Links | BibTeX | Tags: Code-Switching
% Workshop paper in the Fifth CALCS proceedings (pages 119--124).
@inproceedings{parikh-solorio-2021-normalization,
  author    = {Dwija Parikh and Thamar Solorio},
  title     = {Normalization and Back-Transliteration for Code-Switched Data},
  booktitle = {Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching},
  pages     = {119--124},
  publisher = {ACL},
  year      = {2021},
  date      = {2021-06-11},
  doi       = {10.18653/v1/2021.calcs-1.15},
  url       = {https://aclanthology.org/2021.calcs-1.15},
  keywords  = {Code-Switching},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Solorio, Thamar; Chen, Shuguang; Black, Alan W; Diab, Mona; Sitaram, Sunayana; Soto, Victor; Yilmaz, Emre; Srinivasan, Anirudh
Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching Conference
Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching, 2021.
Abstract | Links | BibTeX | Tags: Code-Switching
% Entry for the Fifth CALCS workshop proceedings volume itself (title and booktitle are identical); the eight names are joined with "and".
@conference{-Switching,
  title     = {Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching},
  author    = {Thamar Solorio and Shuguang Chen and Alan W Black and Mona Diab and Sunayana Sitaram and Victor Soto and Emre Yilmaz and Anirudh Srinivasan},
  url       = {https://aclanthology.org/2021.calcs-1.0.pdf},
  year      = {2021},
  date      = {2021-06-01},
  urldate   = {2021-06-01},
  booktitle = {Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching},
  abstract  = {Bienvenidos to the proceedings of the fifth edition of the workshop on computational approaches for linguistic code-switching (CALCS-2021)! Code-switching is this very interesting phenomenon where multilingual speakers communicate by moving back and forth between the languages they speak when communicating with other multilingual speakers. Code-switching (CSW) is predominantly used in speech but since it also tends to be more prevalent in casual settings, we can observe CSW in genres like social media platforms where interactions tend to be more casual.
However interesting, our current NLP technology is lagging behind in the development of resources and methodologies that can effectively process code-switched language. This is true for even the large multilingual pretrained models such as mBERT and BART. At the same time, the growing adoption of smart devices and automated assistants that rely on speech interfaces, makes it even more pressing that our field addresses CSW language data},
  keywords  = {Code-Switching},
  pubstate  = {published},
  tppubtype = {conference}
}
However interesting, our current NLP technology is lagging behind in the development of resources and methodologies that can effectively process code-switched language. This is true for even the large multilingual pretrained models such as mBERT and BART. At the same time, the growing adoption of smart devices and automated assistants that rely on speech interfaces, makes it even more pressing that our field addresses CSW language data
2020
Aguilar, Gustavo; Solorio, Thamar
From English to Code-Switching: Transfer Learning with Strong Morphological Clues Conference
The 58th Annual Meeting of the Association for Computational Linguistics, ACL, 2020.
Abstract | Links | BibTeX | Tags: Code-Switching, Transfer learning
% ACL 2020 paper introducing CS-ELMo (see abstract).
@conference{aguilar20_cs-elmo,
  author    = {Gustavo Aguilar and Thamar Solorio},
  title     = {From English to Code-Switching: Transfer Learning with Strong Morphological Clues},
  booktitle = {The 58th Annual Meeting of the Association for Computational Linguistics},
  editor    = {ACL},
  publisher = {ACL},
  year      = {2020},
  date      = {2020-06-19},
  url       = {https://www.aclweb.org/anthology/2020.acl-main.716.pdf},
  abstract  = {Linguistic Code-switching (CS) is still an understudied phenomenon in natural language processing. The NLP community has mostly focused on monolingual and multi-lingual scenarios, but little attention has been given to CS in particular. This is partly because of the lack of resources and annotated data, despite its increasing occurrence in social media platforms. In this paper, we aim at adapting monolingual models to code-switched text in various tasks. Specifically, we transfer English knowledge from a pre-trained ELMo model to different code-switched language pairs (i.e., Nepali-English, Spanish-English, and Hindi-English) using the task of language identification. Our method, CS-ELMo, is an extension of ELMo with a simple yet effective position-aware attention mechanism inside its character convolutions. We show the effectiveness of this transfer learning step by outperforming multilingual BERT and homologous CS-unaware ELMo models and establishing a new state of the art in CS tasks, such as NER and POS tagging. Our technique can be expanded to more English-paired code-switched languages, providing more resources to the CS community.},
  keywords  = {Code-Switching, Transfer learning},
  pubstate  = {published},
  tppubtype = {conference}
}
Aguilar, Gustavo; Kar, Sudipta; Solorio, Thamar
LinCE: A Centralized Linguistic Code-Switching Evaluation Benchmark Conference
Proceedings of the Twelfth International Conference on Language Resources and Evaluation, LREC, 2020.
Abstract | Links | BibTeX | Tags: benchmark, Code-Switching
% LREC 2020 paper presenting the LinCE benchmark; broken LaTeX markup (macros missing their backslashes) has been removed from the abstract.
@conference{aguilar20_lince,
  title     = {LinCE: A Centralized Linguistic Code-Switching Evaluation Benchmark},
  author    = {Gustavo Aguilar and Sudipta Kar and Thamar Solorio},
  editor    = {LREC},
  url       = {https://www.aclweb.org/anthology/2020.lrec-1.223.pdf},
  year      = {2020},
  date      = {2020-05-11},
  booktitle = {Proceedings of the Twelfth International Conference on Language Resources and Evaluation},
  publisher = {LREC},
  abstract  = {Recent trends in NLP research have raised an interest in linguistic code-switching (CS); modern approaches have been proposed to solve a wide range of NLP tasks on multiple language pairs. Unfortunately, these proposed methods are hardly generalizable to different code-switched languages. In addition, it is unclear whether a model architecture is applicable for a different task while still being compatible with the code-switching setting. This is mainly because of the lack of a centralized benchmark and the sparse corpora that researchers employ based on their specific needs and interests. To facilitate research in this direction, we propose a centralized benchmark for Linguistic Code-switching Evaluation (LinCE) that combines ten corpora covering four different code-switched language pairs (i.e., Spanish-English, Nepali-English, Hindi-English, and Modern Standard Arabic-Egyptian Arabic) and four tasks (i.e., language identification, named entity recognition, part-of-speech tagging, and sentiment analysis). As part of the benchmark centralization effort, we provide an online platform at ritual.uh.edu/lince, where researchers can submit their results while comparing with others in real-time. In addition, we provide the scores of different popular models, including LSTM, ELMo, and multilingual BERT so that the NLP community can compare against state-of-the-art systems. LinCE is a continuous effort, and we will expand it with more low-resource languages and tasks.},
  keywords  = {benchmark, Code-Switching},
  pubstate  = {published},
  tppubtype = {conference}
}
Patwa, Parth; Aguilar, Gustavo; Kar, Sudipta; Pandey, Suraj; PYKL, Srinivas; Gambäck, Björn; Chakraborty, Tanmoy; Solorio, Thamar; Das, Amitava
SemEval-2020 Task 9: Overview of Sentiment Analysis of Code-Mixed Tweets Inproceedings
In: Proceedings of the Fourteenth Workshop on Semantic Evaluation, pp. 774–790, International Committee for Computational Linguistics, Barcelona (online), 2020.
Abstract | Links | BibTeX | Tags: Code-Switching, Sentiment analysis
% SemEval-2020 Task 9 overview paper (pages 774--790); the author list and abstract are cleaned of stray/missing spaces.
@inproceedings{patwa-etal-2020-semeval,
  title     = {SemEval-2020 Task 9: Overview of Sentiment Analysis of Code-Mixed Tweets},
  author    = {Parth Patwa and Gustavo Aguilar and Sudipta Kar and Suraj Pandey and Srinivas PYKL and Björn Gambäck and Tanmoy Chakraborty and Thamar Solorio and Amitava Das},
  url       = {https://www.aclweb.org/anthology/2020.semeval-1.100},
  year      = {2020},
  date      = {2020-01-01},
  booktitle = {Proceedings of the Fourteenth Workshop on Semantic Evaluation},
  pages     = {774--790},
  publisher = {International Committee for Computational Linguistics},
  address   = {Barcelona (online)},
  abstract  = {In this paper, we present the results of the SemEval-2020 Task 9 on Sentiment Analysis of Code-Mixed Tweets (SentiMix 2020). We also release and describe our Hinglish (Hindi-English) and Spanglish (Spanish-English) corpora annotated with word-level language identification and sentence-level sentiment labels. These corpora are comprised of 20K and 19K examples, respectively. The sentiment labels are - Positive, Negative, and Neutral. SentiMix attracted 89 submissions in total including 61 teams that participated in the Hinglish contest and 28 submitted systems to the Spanglish competition. The best performance achieved was 75.0% F1 score for Hinglish and 80.6% F1 for Spanglish. We observe that BERT-like models and ensemble methods are the most common and successful approaches among the participants.},
  keywords  = {Code-Switching, Sentiment analysis},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
}
2018
Mave, Deepthi; Maharjan, Suraj; Solorio, Thamar
Language Identification and Analysis of Code-Switched Social Media Text Workshop
Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching, ACL 2018, Association for Computational Linguistics, Melbourne, Australia, 2018.
Abstract | Links | BibTeX | Tags: Code-Switching
% Workshop paper from the Third CALCS workshop. Uses the standard inproceedings type (tppubtype keeps the "workshop" label), "and"-separated authors, and a double-braced corporate editor.
@inproceedings{dmave2018,
  title     = {Language Identification and Analysis of Code-Switched Social Media Text},
  author    = {Deepthi Mave and Suraj Maharjan and Thamar Solorio},
  editor    = {{Association for Computational Linguistics}},
  url       = {http://www.aclweb.org/anthology/W18-3206},
  year      = {2018},
  date      = {2018-07-19},
  booktitle = {Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching, ACL 2018},
  publisher = {Association for Computational Linguistics},
  address   = {Melbourne, Australia},
  abstract  = {In this paper, we detail our work on comparing different word-level language identification systems for code-switched Hindi-English data and a standard Spanish-English dataset. In this regard, we build a new code-switched dataset for Hindi-English. To understand the code-switching patterns in these language pairs, we investigate different code-switching metrics. We find that the CRF model outperforms the neural network based models by a margin of 2-5 percentage points for Spanish-English and 3-5 percentage points for Hindi-English.},
  keywords  = {Code-Switching},
  pubstate  = {published},
  tppubtype = {workshop}
}
Aguilar, Gustavo; AlGhamdi, Fahad; Soto, Victor; Diab, Mona; Hirschberg, Julia; Solorio, Thamar
Named Entity Recognition on Code-Switched Data: Overview of the CALCS 2018 Shared Task Inproceedings
In: Association for Computational Linguistics (Ed.): Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching, Association for Computational Linguistics, Melbourne, Australia, 2018.
Abstract | Links | BibTeX | Tags: Code-Switching, English-Spanish, Modern Standard Arabic-Egyptian, NER, shared task, Social Media
% Overview paper for the CALCS 2018 NER shared task. The corporate editor is double-braced so it is treated as one indivisible name instead of First/von/Last parts.
@inproceedings{aguilar@calcs2018,
  title     = {Named Entity Recognition on Code-Switched Data: Overview of the CALCS 2018 Shared Task},
  author    = {Gustavo Aguilar and Fahad AlGhamdi and Victor Soto and Mona Diab and Julia Hirschberg and Thamar Solorio},
  editor    = {{Association for Computational Linguistics}},
  url       = {http://www.aclweb.org/anthology/W18-3219},
  year      = {2018},
  date      = {2018-07-15},
  booktitle = {Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching},
  publisher = {Association for Computational Linguistics},
  address   = {Melbourne, Australia},
  abstract  = {In the third shared task of the Computational Approaches to Linguistic Code-Switching (CALCS) workshop, we focus on Named Entity Recognition (NER) on code-switched social-media data. We divide the shared task into two competitions based on the English-Spanish (ENG-SPA) and Modern Standard Arabic-Egyptian (MSA-EGY) language pairs. We use Twitter data and 9 entity types to establish a new dataset for code-switched NER benchmarks. In addition to the CS phenomenon, the diversity of the entities and the social media challenges make the task considerably hard to process. As a result, the best scores of the competitions are 63.76% and 71.61% for ENG-SPA and MSA-EGY, respectively. We present the scores of 9 participants and discuss the most common challenges among submissions.},
  keywords  = {Code-Switching, English-Spanish, Modern Standard Arabic-Egyptian, NER, shared task, Social Media},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2016
Molina, Giovanni; Rey-Villamizar, Nicolas; Solorio, Thamar; AlGhamdi, Fahad; Ghoneim, Mahmoud; Hawwari, Abdelati; Diab, Mona
Overview for the second shared task on language identification in code-switched data Proceeding
Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP 2016, 2016.
Links | BibTeX | Tags: Code-Switching, shared task
% Paper within the Second CALCS workshop proceedings (pages 40--49). Typed as inproceedings, with the proceedings title in booktitle rather than publisher.
@inproceedings{Molina2016,
  title     = {Overview for the second shared task on language identification in code-switched data},
  author    = {Giovanni Molina and Nicolas Rey-Villamizar and Thamar Solorio and Fahad AlGhamdi and Mahmoud Ghoneim and Abdelati Hawwari and Mona Diab},
  url       = {http://www.aclweb.org/anthology/W/W16/W16-5805.pdf},
  year      = {2016},
  date      = {2016-11-01},
  booktitle = {Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP 2016},
  pages     = {40--49},
  keywords  = {Code-Switching, shared task},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
AlGhamdi, Fahad; Molina, Giovanni; Diab, Mona; Solorio, Thamar; Hawwari, Abdelati; Soto, Victor; Hirschberg, Julia
Part of Speech Tagging for Code Switched Data Proceeding
Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP, 2016.
Links | BibTeX | Tags: Code-Switching
% Paper within the Second CALCS workshop proceedings (pages 98--107). Typed as inproceedings, with the proceedings title in booktitle rather than publisher.
@inproceedings{AlGhamdi2016,
  title     = {Part of Speech Tagging for Code Switched Data},
  author    = {Fahad AlGhamdi and Giovanni Molina and Mona Diab and Thamar Solorio and Abdelati Hawwari and Victor Soto and Julia Hirschberg},
  url       = {http://www.aclweb.org/anthology/W/W16/W16-5812.pdf},
  year      = {2016},
  date      = {2016-11-01},
  booktitle = {Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP},
  pages     = {98--107},
  keywords  = {Code-Switching},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Samih, Younes; Maharjan, Suraj; Attia, Mohammed; Kallmeyer, Laura; Solorio, Thamar
Multilingual Code-switching Identification via LSTM Recurrent Neural Networks Proceeding
Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP, 2016.
Links | BibTeX | Tags: Code-Switching, CRF, Deeplearning, Neural Networks
% Paper within the Second CALCS workshop proceedings. Typed as inproceedings, with the proceedings title in booktitle rather than publisher.
@inproceedings{Samih2016,
  title     = {Multilingual Code-switching Identification via LSTM Recurrent Neural Networks},
  author    = {Younes Samih and Suraj Maharjan and Mohammed Attia and Laura Kallmeyer and Thamar Solorio},
  url       = {http://www.aclweb.org/anthology/W/W16/W16-5806.pdf},
  year      = {2016},
  date      = {2016-10-31},
  booktitle = {Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP},
  keywords  = {Code-Switching, CRF, Deeplearning, Neural Networks},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2015
Maharjan, Suraj; Blair, Elizabeth; Bethard, Steven; Solorio, Thamar
Developing Language-tagged Corpora for Code-switching Tweets Inproceedings
In: Proceedings of The 9th Linguistic Annotation Workshop, pp. 72–84, Association for Computational Linguistics, Denver, Colorado, USA, 2015.
Links | BibTeX | Tags: Code-Switching
% Paper in the 9th Linguistic Annotation Workshop proceedings (pages 72--84).
@inproceedings{maharjan-EtAl:2015:LAW,
  author    = {Suraj Maharjan and Elizabeth Blair and Steven Bethard and Thamar Solorio},
  title     = {Developing Language-tagged Corpora for Code-switching Tweets},
  booktitle = {Proceedings of The 9th Linguistic Annotation Workshop},
  pages     = {72--84},
  publisher = {Association for Computational Linguistics},
  address   = {Denver, Colorado, USA},
  year      = {2015},
  date      = {2015-06-05},
  url       = {http://www.aclweb.org/anthology/W15-1608},
  keywords  = {Code-Switching},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2014
Solorio, Thamar; Blair, Elizabeth; Maharjan, Suraj; Bethard, Steven; Diab, Mona; Ghoneim, Mahmoud; Hawwari, Abdelati; AlGhamdi, Fahad; Hirschberg, Julia; Chang, Alison; Fung, Pascale
Overview for the First Shared Task on Language Identification in Code-Switched Data Conference
Proceedings of The First Workshop on Computational Approaches to Code Switching, held in conjunction with EMNLP 2014, ACL, Doha, Qatar, 2014.
Links | BibTeX | Tags: Code-Switching
% Overview paper for the first CALCS shared task (pages 62--72). "Ghoneim" is spelled as in the 2016 overview entry, and booktitle has no trailing period so styles do not emit doubled punctuation.
@conference{SolorioEtAl:14,
  title     = {Overview for the First Shared Task on Language Identification in Code-Switched Data},
  author    = {Thamar Solorio and Elizabeth Blair and Suraj Maharjan and Steven Bethard and Mona Diab and Mahmoud Ghoneim and Abdelati Hawwari and Fahad AlGhamdi and Julia Hirschberg and Alison Chang and Pascale Fung},
  url       = {http://www.aclweb.org/anthology/W/W14/W14-3907.pdf},
  year      = {2014},
  date      = {2014-10-25},
  booktitle = {Proceedings of The First Workshop on Computational Approaches to Code Switching, held in conjunction with EMNLP 2014},
  pages     = {62--72},
  publisher = {ACL},
  address   = {Doha, Qatar},
  keywords  = {Code-Switching},
  pubstate  = {published},
  tppubtype = {conference}
}