2020
Shirani, Amirreza; Dernoncourt, Franck; Lipka, Nedim; Asente, Paul; Echevarria, Jose; Solorio, Thamar: SemEval-2020 Task 10: Emphasis Selection for Written Text in Visual Media. In: Proceedings of the Fourteenth Workshop on Semantic Evaluation, International Committee for Computational Linguistics, Barcelona (online), 2020. Link: https://www.aclweb.org/anthology/2020.semeval-1.184. Tags: Emphasis Selection, SemEval. Abstract: In this paper, we present the main findings and compare the results of SemEval-2020 Task 10, Emphasis Selection for Written Text in Visual Media. The goal of this shared task is to design automatic methods for emphasis selection, i.e. choosing candidates for emphasis in textual content to enable automated design assistance in authoring. The main focus is on short text instances for social media, with a variety of examples, from social media posts to inspirational quotes. Participants were asked to model emphasis using plain text with no additional context from the user or other design considerations. The SemEval-2020 Emphasis Selection shared task attracted 197 participants in the early phase, and a total of 31 teams made submissions to this task. The highest-ranked submission achieved a 0.823 Match_m score. The analysis of systems submitted to the task indicates that BERT and RoBERTa were the most common choices of pre-trained model, and that part-of-speech (POS) tags were the most useful feature. Full results can be found on the task's website.
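For readers unfamiliar with the shared task's evaluation, the sketch below computes a top-m agreement score in the spirit of Match_m: take the m highest-scoring tokens under the model and under the gold emphasis probabilities and measure their overlap. The choice of m, the normalization by m, and the toy scores are assumptions for illustration, not the task's official implementation.

import numpy as np

def match_m(pred_scores, gold_probs, m=4):
    """Top-m agreement between predicted and gold emphasis scores.

    pred_scores, gold_probs: per-token scores for one instance.
    Returns |intersection of top-m indices| / m (a sketch; the official
    shared-task metric may normalize differently).
    """
    pred_top = set(np.argsort(pred_scores)[::-1][:m])
    gold_top = set(np.argsort(gold_probs)[::-1][:m])
    return len(pred_top & gold_top) / m

# Toy example: 6-token sentence with hypothetical scores.
pred = [0.1, 0.8, 0.3, 0.7, 0.2, 0.9]
gold = [0.0, 0.9, 0.2, 0.6, 0.1, 0.8]
print(match_m(pred, gold, m=2))  # 1.0: both rank tokens 1 and 5 highest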
Kar, Sudipta; Aguilar, Gustavo; Lapata, Mirella; Solorio, Thamar: Multi-view Story Characterization from Movie Plot Synopses and Reviews. In: EMNLP 2020, forthcoming. Tags: Narrative Analysis, Text Classification.
Shirani, Amirreza; Dernoncourt, Franck; Echevarria, Jose; Asente, Paul; Lipka, Nedim; Solorio, Thamar: Let Me Choose: From Verbal Context to Font Selection. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL 2020), Association for Computational Linguistics, 2020. Link: https://www.aclweb.org/anthology/2020.acl-main.762.pdf. DOI: 10.18653/v1/2020.acl-main.762. Tags: Font, font selection. Abstract: In this paper, we aim to learn associations between visual attributes of fonts and the verbal context of the texts they are typically applied to. Compared to related work leveraging the surrounding visual context, we choose to focus only on the input text, as this can enable new applications for which the text is the only visual element in the document. We introduce a new dataset containing examples of different topics in social media posts and ads, labeled through crowd-sourcing. Due to the subjective nature of the task, multiple fonts might be perceived as acceptable for an input text, which makes this problem challenging. To this end, we investigate different end-to-end models to learn label distributions on crowd-sourced data and capture inter-subjectivity across all annotations.
Aguilar, Gustavo; Solorio, Thamar: From English to Code-Switching: Transfer Learning with Strong Morphological Clues. In: The 58th Annual Meeting of the Association for Computational Linguistics (ACL 2020), 2020. Link: https://www.aclweb.org/anthology/2020.acl-main.716.pdf. Tags: Code-Switching, Transfer learning. Abstract: Linguistic code-switching (CS) is still an understudied phenomenon in natural language processing. The NLP community has mostly focused on monolingual and multilingual scenarios, but little attention has been given to CS in particular. This is partly because of the lack of resources and annotated data, despite its increasing occurrence in social media platforms. In this paper, we aim at adapting monolingual models to code-switched text in various tasks. Specifically, we transfer English knowledge from a pre-trained ELMo model to different code-switched language pairs (i.e., Nepali-English, Spanish-English, and Hindi-English) using the task of language identification. Our method, CS-ELMo, is an extension of ELMo with a simple yet effective position-aware attention mechanism inside its character convolutions. We show the effectiveness of this transfer learning step by outperforming multilingual BERT and homologous CS-unaware ELMo models and establishing a new state of the art in CS tasks, such as NER and POS tagging. Our technique can be expanded to more English-paired code-switched languages, providing more resources to the CS community.
Aguilar, Gustavo; Kar, Sudipta; Solorio, Thamar: LinCE: A Centralized Linguistic Code-Switching Evaluation Benchmark. In: Proceedings of the Twelfth International Conference on Language Resources and Evaluation (LREC 2020), 2020. Link: https://www.aclweb.org/anthology/2020.lrec-1.223.pdf. Tags: benchmark, Code-Switching. Abstract: Recent trends in NLP research have raised an interest in linguistic code-switching (CS); modern approaches have been proposed to solve a wide range of NLP tasks on multiple language pairs. Unfortunately, these proposed methods are hardly generalizable to different code-switched languages. In addition, it is unclear whether a model architecture is applicable for a different task while still being compatible with the code-switching setting. This is mainly because of the lack of a centralized benchmark and the sparse corpora that researchers employ based on their specific needs and interests. To facilitate research in this direction, we propose a centralized benchmark for Linguistic Code-switching Evaluation (LinCE) that combines ten corpora covering four different code-switched language pairs (i.e., Spanish-English, Nepali-English, Hindi-English, and Modern Standard Arabic-Egyptian Arabic) and four tasks (i.e., language identification, named entity recognition, part-of-speech tagging, and sentiment analysis). As part of the benchmark centralization effort, we provide an online platform at ritual.uh.edu/lince, where researchers can submit their results while comparing with others in real time. In addition, we provide the scores of different popular models, including LSTM, ELMo, and multilingual BERT, so that the NLP community can compare against state-of-the-art systems. LinCE is a continuous effort, and we will expand it with more low-resource languages and tasks.
Shafaei, Mahsa; Samghabadi, Niloofar Safi; Kar, Sudipta; Solorio, Thamar: Age Suitability Rating: Predicting the MPAA Rating Based on Movie Dialogues. In: Proceedings of the Twelfth International Conference on Language Resources and Evaluation (LREC 2020), 2020. Link: http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.166.pdf. Tags: MPAA Rating, Story Analysis, Text Classification. Abstract: Movies help us learn and inspire societal change. But they can also contain objectionable content that negatively affects viewers' behavior, especially children. In this paper, our goal is to predict the suitability of movie content for children and young adults based on scripts. The criterion that we use to measure suitability is the MPAA rating, which is specifically designed for this purpose. We create a corpus of movie MPAA ratings and propose an RNN-based architecture with attention that jointly models the genre and the emotions in the script to predict the MPAA rating. We achieve an 81% weighted F1-score for the classification model, which outperforms the traditional machine learning method by 7%.
Aguilar, Gustavo; Ling, Yuan; Zhang, Yu; Yao, Benjamin; Fan, Xing; Guo, Chenlei: Knowledge Distillation from Internal Representations. In: The Thirty-Fourth AAAI Conference on Artificial Intelligence (AAAI 2020), 2020. Link: https://arxiv.org/pdf/1910.03723.pdf. Abstract: Knowledge distillation is typically conducted by training a small model (the student) to mimic a large and cumbersome model (the teacher). The idea is to compress the knowledge from the teacher by using its output probabilities as soft labels to optimize the student. However, when the teacher is considerably large, there is no guarantee that the internal knowledge of the teacher will be transferred into the student; even if the student closely matches the soft labels, its internal representations may be considerably different. This internal mismatch can undermine the generalization capabilities originally intended to be transferred from the teacher to the student. In this paper, we propose to distill the internal representations of a large model such as BERT into a simplified version of it. We formulate two ways to distill such representations and various algorithms to conduct the distillation. We experiment with datasets from the GLUE benchmark and consistently show that adding knowledge distillation from internal representations is a more powerful method than only using soft-label distillation.
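The idea of pairing soft-label distillation with an internal-representation term can be sketched as a single training loss, as below. The temperature, the cosine-based alignment of one paired hidden layer, and the mixing weight are assumptions for illustration, not the paper's exact formulation.

import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits,
                      student_hidden, teacher_hidden, T=2.0, alpha=0.5):
    """Soft-label KD plus an internal-representation term (illustrative sketch).

    student_hidden / teacher_hidden: (batch, dim) vectors from layers chosen
    to be paired; the pairing strategy itself is an assumption here.
    """
    # Standard soft-label distillation: KL between temperature-softened distributions.
    soft = F.kl_div(F.log_softmax(student_logits / T, dim=-1),
                    F.softmax(teacher_logits / T, dim=-1),
                    reduction="batchmean") * (T * T)
    # Internal distillation: push the paired hidden states to align.
    internal = 1.0 - F.cosine_similarity(student_hidden, teacher_hidden, dim=-1).mean()
    return alpha * soft + (1.0 - alpha) * internal

# Toy shapes: batch of 8, 3 classes, 128-dimensional hidden states.
s_logits, t_logits = torch.randn(8, 3), torch.randn(8, 3)
s_hid, t_hid = torch.randn(8, 128), torch.randn(8, 128)
print(distillation_loss(s_logits, t_logits, s_hid, t_hid))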
López-Monroy, A. Pastor; González, Fabio A.; Solorio, Thamar: Early author profiling on Twitter using profile features with multi-resolution. In: Expert Systems with Applications, 140, 2020 (forthcoming), ISSN: 0957-4174. Link: http://www.sciencedirect.com/science/article/pii/S095741741930627X. DOI: 10.1016/j.eswa.2019.112909. Tags: Profiling. Abstract: The Author Profiling (AP) task aims to predict demographic characteristics of authors from their documents (e.g., age, gender, native language). The research so far has focused only on forensic scenarios, performing post-analysis using all the available text evidence. This paper introduces the task of Early Author Profiling (EAP) on Twitter. The goal is to effectively recognize profiles using as few tweets as possible from the user history. The task is highly relevant to support social media analysis and different problems related to security and marketing, where prevention and anticipation are crucial. This work proposes a novel strategy that combines a state-of-the-art representation for early text classification and specialized word vectors for author profiling tasks. In this strategy we build prototypical features called Profile-based Meta-Words, which allow us to model AP information at different levels of granularity. Our evaluation shows that the proposed methodology is well suited for profiling from little text evidence (e.g., a handful of tweets) in early stages, while as more tweets become available other granularities better encode larger amounts of text in late stages. We evaluated the proposed ideas on gender and language variety identification for English and Spanish, and showed that the proposal outperforms state-of-the-art methodologies.
Arevalo, John; Solorio, Thamar; Montes-y-Gómez, Manuel; González, Fabio: Gated multimodal networks. In: Neural Computing and Applications, 2020, ISSN: 1433-3058. Link and DOI: https://doi.org/10.1007/s00521-019-04559-1. Abstract: This paper considers the problem of leveraging multiple sources of information or data modalities (e.g., images and text) in neural networks. We define a novel model called the gated multimodal unit (GMU), designed as an internal unit in a neural network architecture whose purpose is to find an intermediate representation based on a combination of data from different modalities. The GMU learns to decide how modalities influence the activation of the unit using multiplicative gates. The GMU can be used as a building block for different kinds of neural networks and can be seen as a form of intermediate fusion. The model was evaluated on two multimodal learning tasks in conjunction with fully connected and convolutional neural networks. We compare the GMU with other early- and late-fusion methods, outperforming their classification scores on two benchmark datasets: MM-IMDb and DeepScene.
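A minimal sketch of the gated multimodal unit for two modalities follows: each modality is projected and squashed with tanh, and a sigmoid gate computed from both inputs decides how much each modality contributes to the fused representation. The dimensionalities and the two-modality restriction are illustrative assumptions; the published model generalizes beyond this.

import torch
import torch.nn as nn

class GMU(nn.Module):
    """Gated Multimodal Unit for two modalities (illustrative sketch)."""
    def __init__(self, dim_text, dim_img, dim_out):
        super().__init__()
        self.proj_text = nn.Linear(dim_text, dim_out)
        self.proj_img = nn.Linear(dim_img, dim_out)
        self.gate = nn.Linear(dim_text + dim_img, dim_out)

    def forward(self, x_text, x_img):
        h_text = torch.tanh(self.proj_text(x_text))  # modality-specific representation
        h_img = torch.tanh(self.proj_img(x_img))
        z = torch.sigmoid(self.gate(torch.cat([x_text, x_img], dim=-1)))  # multiplicative gate
        return z * h_text + (1 - z) * h_img          # gated fusion

# Toy usage: 300-d text features, 2048-d image features, 512-d fused output.
gmu = GMU(300, 2048, 512)
fused = gmu(torch.randn(4, 300), torch.randn(4, 2048))
print(fused.shape)  # torch.Size([4, 512])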
2019
Maharjan, Suraj; Mave, Deepthi; Shrestha, Prasha; Montes, Manuel; González, Fabio A.; Solorio, Thamar: Jointly Learning Author and Annotated Character N-gram Embeddings: A Case Study in Literary Text. In: Proceedings of the 2019 Conference on Recent Advances in Natural Language Processing (RANLP), ACL, Varna, Bulgaria, 2019 (forthcoming). Tags: Authorship Attribution, Book Likability Prediction, Multitask, Neural Language Model, Transfer learning. Abstract: An author's way of presenting a story through his/her writing style has a great impact on whether the story will be liked by readers or not. In this paper, we learn representations for authors of literary texts together with representations for character n-grams annotated with their functional roles. We train a neural character n-gram based language model using an external corpus of literary texts and transfer the learned representations for use in downstream tasks. We show that augmenting the knowledge from external works of authors produces results competitive with other style-based methods for book likability prediction, genre classification, and authorship attribution.
Kar, Sudipta; Aguilar, Gustavo; Solorio, Thamar: Multi-view Characterization of Stories from Narratives and Reviews using Multi-label Ranking. Online, 2019 (arXiv). Link: https://arxiv.org/abs/1908.09083. Tags: Narrative Analysis.
Shafaei, Mahsa; Samghabadi, Niloofar Safi; Kar, Sudipta; Solorio, Thamar: Rating for Parents: Predicting Children Suitability Rating for Movies Based on Language of the Movies. Online, arXiv, 2019 (visited: 21.08.2019). Link: https://arxiv.org/abs/1908.07819. Tags: Abusive Language detection, Sentiment analysis, Text Classification. Abstract: The film culture has grown tremendously in recent years. The large number of streaming services puts films among the most convenient forms of entertainment in today's world. Films can help us learn and inspire societal change. But they can also negatively affect viewers. In this paper, our goal is to predict the suitability of movie content for children and young adults based on scripts. The criterion that we use to measure suitability is the MPAA rating, which is specifically designed for this purpose. We propose an RNN-based architecture with attention that jointly models the genre and the emotions in the script to predict the MPAA rating. We achieve a 78% weighted F1-score for the classification model, which outperforms the traditional machine learning method by 6%.
Aguilar, Gustavo; Rozgić, Viktor; Wang, Weiran; Wang, Chao: Multimodal and Multi-view Models for Emotion Recognition. In: The 57th Annual Meeting of the Association for Computational Linguistics (ACL 2019), 2019. Link: https://arxiv.org/abs/1906.10198. Tags: acoustics, emotion recognition, language, Multimodal, multiview. Abstract: Studies on emotion recognition (ER) show that combining lexical and acoustic information results in more robust and accurate models. The majority of the studies focus on settings where both modalities are available in training and evaluation. However, in practice, this is not always the case; getting ASR output may represent a bottleneck in a deployment pipeline due to computational complexity or privacy-related constraints. To address this challenge, we study the problem of efficiently combining acoustic and lexical modalities during training while still providing a deployable acoustic model that does not require lexical inputs. We first experiment with multimodal models and two attention mechanisms to assess the extent of the benefits that lexical information can provide. Then, we frame the task as a multi-view learning problem to induce semantic information from a multimodal model into our acoustic-only network using a contrastive loss function. Our multimodal model outperforms the previous state of the art on the USC-IEMOCAP dataset reported on lexical and acoustic information. Additionally, our multi-view-trained acoustic network significantly surpasses models that have been exclusively trained with acoustic features.
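The multi-view training step can be sketched as a contrastive objective that pulls each acoustic embedding toward the paired embedding of the same utterance from the lexical/multimodal network and pushes it away from mismatched utterances in the batch. The margin-based, hardest-negative formulation below is an assumption for illustration, not necessarily the loss used in the paper.

import torch
import torch.nn.functional as F

def contrastive_view_loss(acoustic_emb, lexical_emb, margin=0.5):
    """Pull matched acoustic/lexical pairs together, push mismatched
    pairs in the batch apart (illustrative sketch)."""
    a = F.normalize(acoustic_emb, dim=-1)
    l = F.normalize(lexical_emb, dim=-1)
    sim = a @ l.t()                               # (batch, batch) cosine similarities
    pos = sim.diag()                              # matched pairs
    neg = sim - torch.eye(sim.size(0)) * 1e9      # mask out the diagonal
    hardest_neg = neg.max(dim=1).values
    return F.relu(margin - pos + hardest_neg).mean()

print(contrastive_view_loss(torch.randn(8, 256), torch.randn(8, 256)))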
Shirani, Amirreza; Dernoncourt, Franck; Asente, Paul; Lipka, Nedim; Kim, Seokhwan; Echevarria, Jose; Solorio, Thamar: Learning Emphasis Selection for Written Text in Visual Media from Crowd-Sourced Label Distributions. In: The 57th Annual Meeting of the Association for Computational Linguistics (ACL 2019), 2019. Link: https://www.aclweb.org/anthology/papers/P/P19/P19-1112/. Tags: Emphasis Selection. Abstract: In visual communication, text emphasis is used to increase the comprehension of written text and to convey the author's intent. We study the problem of emphasis selection, i.e. choosing candidates for emphasis in short written text, to enable automated design assistance in authoring. Without knowing the author's intent and only considering the input text, multiple emphasis selections are valid. We propose a model that employs end-to-end label distribution learning (LDL) on crowd-sourced data and predicts a selection distribution, capturing the inter-subjectivity (common sense) in the audience as well as the ambiguity of the input. We compare the model with several baselines in which the problem is transformed to single-label learning by mapping label distributions to absolute labels via majority voting.
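A sketch of the label-distribution-learning setup is shown below: a sequence encoder predicts a per-token distribution over emphasize/don't-emphasize and is trained with KL divergence against the crowd-sourced label proportions. The BiLSTM encoder, the dimensions, and the random toy targets are assumptions for illustration, not the paper's architecture.

import torch
import torch.nn as nn
import torch.nn.functional as F

class EmphasisLDL(nn.Module):
    """Per-token emphasis distribution learner (illustrative sketch)."""
    def __init__(self, vocab_size, emb_dim=100, hidden=128):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden, batch_first=True, bidirectional=True)
        self.head = nn.Linear(2 * hidden, 2)   # per token: [emphasize, don't emphasize]

    def forward(self, token_ids):
        h, _ = self.encoder(self.emb(token_ids))
        return F.log_softmax(self.head(h), dim=-1)

# Crowd label distributions, e.g. 7 of 9 annotators emphasized a token -> [0.78, 0.22].
model = EmphasisLDL(vocab_size=5000)
tokens = torch.randint(0, 5000, (2, 6))                  # batch of 2 sentences, 6 tokens
crowd = torch.rand(2, 6, 2); crowd = crowd / crowd.sum(-1, keepdim=True)
loss = F.kl_div(model(tokens), crowd, reduction="batchmean")
print(loss)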
Shafaei, Mahsa; Lopez-Monroy, Adrian Pastor; Solorio, Thamar: Exploiting Textual, Visual and Product Features for Predicting the Likeability of Movies. In: The 32nd International FLAIRS Conference, 2019. Link: https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS19/paper/view/18305. Tags: Sentiment analysis, Text Classification. Abstract: Watching movies is one of the most popular forms of entertainment. Every year, a huge amount of money goes into the movie industry to release movies to the market. In this paper, we propose a multimodal model to predict the likability of movies using textual, visual and product features. With the help of these features, we capture different aspects of movies and feed them as inputs to binary and multi-class classification and regression models to predict the IMDB rating of movies at early steps of production. We also propose our own dataset consisting of about 15,000 movie subtitles along with their metadata and poster images. We achieve 76% and 63% weighted F1-score for binary and multi-class classification respectively, and 0.7 mean square error for the regression model. Using prediction methods and data analysis, this research helps the movie business to be more productive.
Shirani, Amirreza; Xu, Bowen; Lo, David; Solorio, Thamar; Alipour, Amin: Question Relatedness on Stack Overflow: The Task, Dataset, and Corpus-inspired Models. In: AAAI Reasoning for Complex Question Answering Workshop (AAAI 2019), 2019. Link: https://arxiv.org/pdf/1905.01966.pdf. Tags: Community question answering, cQA, Question relatedness, stack overflow. Abstract: Domain-specific community question answering is becoming an integral part of professions. Finding related questions and answers in these communities can significantly improve the effectiveness and efficiency of information seeking. Stack Overflow is one of the most popular communities, used by millions of programmers. In this paper, we analyze the problem of predicting knowledge unit (question thread) relatedness in Stack Overflow. In particular, we formulate the question relatedness task as a multi-class classification problem with four degrees of relatedness. We present a large-scale dataset with more than 300K pairs. To the best of our knowledge, this dataset is the largest domain-specific dataset for question-question relatedness. We present the steps that we took to collect, clean, process, and assure the quality of the dataset. The proposed dataset on Stack Overflow is a useful resource to develop novel solutions, specifically data-hungry neural network models, for the prediction of relatedness in technical community question-answering forums. We adapt a neural network architecture and a traditional model for this task that effectively utilize information from different parts of knowledge units to compute the relatedness between them. These models can be used to benchmark novel models, as they perform well in our task and in a closely similar task.
2018
Maharjan, Suraj; Montes, Manuel; González, Fabio A.; Solorio, Thamar: A Genre-Aware Attention Model to Improve the Likability Prediction of Books. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2018. Link: http://aclweb.org/anthology/D18-1375. Tags: Genre-Aware Attention Model, Multitask. Abstract: Likability prediction of books has many uses. Readers, writers, as well as the publishing industry, can all benefit from automatic book likability prediction systems. In order to make reliable decisions, these systems need to assimilate information from different aspects of a book in a sensible way. We propose a novel multimodal neural architecture that incorporates genre supervision to assign weights to individual feature types. Our proposed method is capable of dynamically tailoring the weights given to feature types based on the characteristics of each book. Our architecture achieves competitive results and even outperforms the state of the art for this task.
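The genre-supervised weighting of feature types can be sketched as an attention layer whose scores are conditioned on a genre representation, as below. The bilinear scoring function, the genre embedding, and all dimensions are assumptions for illustration rather than the paper's actual architecture.

import torch
import torch.nn as nn
import torch.nn.functional as F

class GenreAwareAttention(nn.Module):
    """Weight feature-type vectors with genre-conditioned attention (sketch)."""
    def __init__(self, feat_dim, genre_dim, n_genres):
        super().__init__()
        self.genre_emb = nn.Embedding(n_genres, genre_dim)
        self.score = nn.Bilinear(genre_dim, feat_dim, 1)

    def forward(self, feature_types, genre_id):
        # feature_types: (batch, n_feature_types, feat_dim)
        g = self.genre_emb(genre_id)                               # (batch, genre_dim)
        g = g.unsqueeze(1).repeat(1, feature_types.size(1), 1)
        weights = F.softmax(self.score(g, feature_types), dim=1)   # (batch, n_types, 1)
        return (weights * feature_types).sum(dim=1)                # weighted book representation

att = GenreAwareAttention(feat_dim=64, genre_dim=16, n_genres=8)
book_feats = torch.randn(4, 5, 64)            # 5 hypothetical feature types per book
print(att(book_feats, torch.randint(0, 8, (4,))).shape)  # torch.Size([4, 64])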
Samghabadi, Niloofar Safi; Mave, Deepthi; Kar, Sudipta; Solorio, Thamar: RiTUAL-UH at TRAC 2018 Shared Task: Aggression Identification. In: TRAC-1 @ COLING 2018, 2018. Link: http://www.aclweb.org/anthology/W18-4402. Tags: Abusive Language detection, Aggression Identification. Abstract: This paper presents our system for the "TRAC 2018 Shared Task on Aggression Identification". Our best systems for the English dataset use a combination of lexical and semantic features. However, for the Hindi data, using only lexical features gave us the best results. We obtained weighted F1-measures of 0.5921 for the English Facebook task (ranked 12th), 0.5663 for the English Social Media task (ranked 6th), 0.6451 for the Hindi Facebook task (ranked 1st), and 0.4853 for the Hindi Social Media task (ranked 2nd).
Kar, Sudipta; Maharjan, Suraj; Solorio, Thamar: Folksonomication: Predicting Tags for Movies from Plot Synopses using Emotion Flow Encoded Neural Network. In: Proceedings of the 27th International Conference on Computational Linguistics (COLING 2018), 2018. Link: http://ritual.uh.edu/folksonomication-2018. Tags: CNN, Narrative Analysis, Sentiment analysis.
Mave, Deepthi; Maharjan, Suraj; Solorio, Thamar: Language Identification and Analysis of Code-Switched Social Media Text. In: Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching, ACL 2018, Association for Computational Linguistics, Melbourne, Australia, 2018. Link: http://www.aclweb.org/anthology/W18-3206. Tags: Code-Switching. Abstract: In this paper, we detail our work on comparing different word-level language identification systems for code-switched Hindi-English data and a standard Spanish-English dataset. In this regard, we build a new code-switched dataset for Hindi-English. To understand the code-switching patterns in these language pairs, we investigate different code-switching metrics. We find that the CRF model outperforms the neural network based models by a margin of 2-5 percentage points for Spanish-English and 3-5 percentage points for Hindi-English.
Aguilar, Gustavo; AlGhamdi, Fahad; Soto, Victor; Diab, Mona; Hirschberg, Julia; Solorio, Thamar: Named Entity Recognition on Code-Switched Data: Overview of the CALCS 2018 Shared Task. In: Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching, Association for Computational Linguistics, Melbourne, Australia, 2018. Link: http://www.aclweb.org/anthology/W18-3219. Tags: Code-Switching, English-Spanish, Modern Standard Arabic-Egyptian, NER, shared task, Social Media. Abstract: In the third shared task of the Computational Approaches to Linguistic Code-Switching (CALCS) workshop, we focus on Named Entity Recognition (NER) on code-switched social media data. We divide the shared task into two competitions based on the English-Spanish (ENG-SPA) and Modern Standard Arabic-Egyptian (MSA-EGY) language pairs. We use Twitter data and 9 entity types to establish a new dataset for code-switched NER benchmarks. In addition to the CS phenomenon, the diversity of the entities and the social media challenges make the task considerably hard to process. As a result, the best scores of the competitions are 63.76% and 71.61% for ENG-SPA and MSA-EGY, respectively. We present the scores of 9 participants and discuss the most common challenges among submissions.
López-Monroy, A. Pastor; González, Fabio A.; Montes-y-Gómez, Manuel; Escalante, Hugo Jair; Solorio, Thamar: Early Text Classification using Multi-Resolution Concept Representations. In: The 16th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2018), Association for Computational Linguistics, 2018. Link: http://www.aclweb.org/anthology/N18-1110. Tags: Text Classification. Abstract: This paper proposes a novel document representation, called Multi-Resolution Representation (MulR), to improve the early detection of risks in social media sources. The goal is to effectively identify the potential risk using as little evidence as possible and with as much anticipation as possible. MulR allows us to generate multiple "views" of the text. These views capture different semantic meanings for words and documents at different levels of granularity, which is very useful in early scenarios to model the variable amounts of evidence. Our experimental evaluation shows that MulR using low resolutions is better suited for modeling short documents (very early stages), whereas large documents (medium/late stages) are better modeled with higher resolutions. We evaluate the proposed ideas in two different tasks where anticipation is critical: sexual predator detection and depression detection. The experimental evaluation for these early tasks revealed that the proposed approach outperforms previous methodologies by a considerable margin.
Maharjan, Suraj; Kar, Sudipta; Montes, Manuel; González, Fabio A.; Solorio, Thamar: Letting Emotions Flow: Success Prediction by Modeling the Flow of Emotions in Books. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computational Linguistics, New Orleans, Louisiana, 2018. Link: http://www.aclweb.org/anthology/N18-2042. Tags: Attention Model, Emotion Flow, Emotion Shapes, Likability Classification, Multitask. Abstract: Books have the power to make us feel happiness, sadness, pain, surprise, or sorrow. An author's dexterity in the use of these emotions captivates readers and makes it difficult for them to put the book down. In this paper, we model the flow of emotions over a book using recurrent neural networks and quantify its usefulness in predicting the book's success. We obtained the best weighted F1-score of 0.690 for predicting books' success in a multitask setting (simultaneously predicting success and genre of books).
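The flow of emotions can be sketched by splitting a book into consecutive chunks, representing each chunk with normalized emotion-category counts from a word-emotion lexicon, and feeding the resulting sequence to a recurrent classifier. The tiny hand-made lexicon, the chunking scheme, and the GRU classifier below are assumptions for illustration, not the paper's exact pipeline.

import torch
import torch.nn as nn

# Hypothetical word-emotion lexicon (NRC-style): word -> set of emotion categories.
EMOTIONS = ["joy", "sadness", "anger", "fear", "surprise", "trust", "anticipation", "disgust"]
LEXICON = {"happy": {"joy"}, "dark": {"fear", "sadness"}, "betrayal": {"anger", "sadness"}}

def emotion_flow(book_text, n_chunks=10):
    """Return an (n_chunks, n_emotions) tensor of normalized emotion counts."""
    words = book_text.lower().split()
    chunk_size = max(1, len(words) // n_chunks)
    flow = torch.zeros(n_chunks, len(EMOTIONS))
    for i in range(n_chunks):
        chunk = words[i * chunk_size:(i + 1) * chunk_size]
        for w in chunk:
            for emo in LEXICON.get(w, ()):
                flow[i, EMOTIONS.index(emo)] += 1
        flow[i] /= max(1, len(chunk))
    return flow

# Sequence model over the flow: GRU encoder plus a success classifier (illustrative).
encoder = nn.GRU(len(EMOTIONS), 32, batch_first=True)
classifier = nn.Linear(32, 2)
flow = emotion_flow("a happy start turns dark after the betrayal ... " * 50)
_, h = encoder(flow.unsqueeze(0))
print(classifier(h[-1]))  # logits for (unsuccessful, successful)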
Aguilar, Gustavo; López Monroy, A. Pastor; Gonzalez, Fabio A; Solorio, Thamar Modeling Noisiness to Recognize Named Entities using Multitask Neural Networks on Social Media Inproceedings Association for Computational Linguistics (Ed.): Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computational Linguistics, New Orleans, Louisiana, 2018. Abstract | Links | BibTeX | Tags: CRF, Multitask, NER, Phonetics, Phonology, Social Media @inproceedings{gaguilar2018, title = {Modeling Noisiness to Recognize Named Entities using Multitask Neural Networks on Social Media}, author = {Gustavo Aguilar and A. Pastor López Monroy and Fabio A. Gonzalez and Thamar Solorio}, editor = {Association for Computational Linguistics}, url = {http://www.aclweb.org/anthology/N18-1127}, year = {2018}, date = {2018-06-01}, booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, publisher = {Association for Computational Linguistics}, address = {New Orleans, Louisiana}, abstract = {Recognizing named entities in a document is a key task in many NLP applications. Although current state-of-the-art approaches to this task reach a high performance on clean text (e.g. newswire genres), those algorithms dramatically degrade when they are moved to noisy environments such as social media domains. We present two systems that address the challenges of processing social media data using character-level phonetics and phonology, word embeddings, and Part-of-Speech tags as features. The first model is a multitask end-to-end Bidirectional Long Short-Term Memory (BLSTM)-Conditional Random Field (CRF) network whose output layer contains two CRF classifiers. The second model uses a multitask BLSTM network as a feature extractor that transfers the learning to a CRF classifier for the final prediction. Our systems outperform the current state-of-the-art F1 scores on the Workshop on Noisy User-generated Text 2017 dataset by 2.45% and 3.69%, establishing a more suitable approach for social media environments.}, keywords = {CRF, Multitask, NER, Phonetics, Phonology, Social Media}, pubstate = {published}, tppubtype = {inproceedings} } Recognizing named entities in a document is a key task in many NLP applications. Although current state-of-the-art approaches to this task reach a high performance on clean text (e.g. newswire genres), those algorithms dramatically degrade when they are moved to noisy environments such as social media domains. We present two systems that address the challenges of processing social media data using character-level phonetics and phonology, word embeddings, and Part-of-Speech tags as features. The first model is a multitask end-to-end Bidirectional Long Short-Term Memory (BLSTM)-Conditional Random Field (CRF) network whose output layer contains two CRF classifiers. The second model uses a multitask BLSTM network as a feature extractor that transfers the learning to a CRF classifier for the final prediction. Our systems outperform the current state-of-the-art F1 scores on the Workshop on Noisy User-generated Text 2017 dataset by 2.45% and 3.69%, establishing a more suitable approach for social media environments. |
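The multitask design sketched in this abstract (a shared bidirectional LSTM encoder with one output layer per task) can be outlined in a few lines. In this sketch the CRF output layers of the actual systems are replaced with plain per-token softmax classifiers to keep it short, and the vocabulary size, label sets, and dimensions are made-up values.

import torch
import torch.nn as nn

class MultitaskNER(nn.Module):
    def __init__(self, vocab_size=5000, emb=64, hidden=64, n_seg=3, n_cat=13):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb)
        self.blstm = nn.LSTM(emb, hidden, batch_first=True, bidirectional=True)
        self.seg_head = nn.Linear(2 * hidden, n_seg)    # secondary task: B/I/O segmentation
        self.cat_head = nn.Linear(2 * hidden, n_cat)    # primary task: fine-grained categories
    def forward(self, token_ids):                       # (batch, seq_len)
        h, _ = self.blstm(self.emb(token_ids))          # (batch, seq_len, 2*hidden)
        return self.seg_head(h), self.cat_head(h)

model = MultitaskNER()
tokens = torch.randint(0, 5000, (2, 12))                # two toy sentences of 12 tokens
seg_logits, cat_logits = model(tokens)
loss = (nn.CrossEntropyLoss()(seg_logits.reshape(-1, 3), torch.randint(0, 3, (24,)))
        + nn.CrossEntropyLoss()(cat_logits.reshape(-1, 13), torch.randint(0, 13, (24,))))
loss.backward()                                         # both tasks update the shared encoder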
Kar, Sudipta; Maharjan, Suraj; López-Monroy, Pastor A; Solorio, Thamar MPST: A Corpus of Movie Plot Synopses with Tags Conference Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018), European Language Resources Association (ELRA), 2018. Abstract | Links | BibTeX | Tags: Information Extraction, Narrative Analysis, Sentiment analysis, Text Classification @conference{Kar2018, title = {MPST: A Corpus of Movie Plot Synopses with Tags}, author = {Sudipta Kar and Suraj Maharjan and A. Pastor López-Monroy and Thamar Solorio}, url = {http://sudiptakar.info/wp-content/uploads/2018/05/322_LREC_2018.pdf, Slide http://sudiptakar.info/wp-content/uploads/2018/02/mpst-corpus-movie-2.pdf, Paper}, year = {2018}, date = {2018-05-10}, booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, publisher = {European Language Resources Association (ELRA)}, abstract = {Social tagging of movies reveals a wide range of heterogeneous information about movies, like the genre, plot structure, soundtracks, metadata, visual and emotional experiences. Such information can be valuable in building automatic systems to create tags for movies. Automatic tagging systems can help recommendation engines to improve the retrieval of similar movies as well as help viewers to know what to expect from a movie in advance. In this paper, we set out to the task of collecting a corpus of movie plot synopses and tags. We describe a methodology that enabled us to build a fine-grained set of around 70 tags exposing heterogeneous characteristics of movie plots and the multi-label associations of these tags with some 14K movie plot synopses. We investigate how these tags correlate with movies and the flow of emotions throughout different types of movies. Finally, we use this corpus to explore the feasibility of inferring tags from plot synopses. We expect the corpus will be useful in other tasks where analysis of narratives is relevant.}, keywords = {Information Extraction, Narrative Analysis, Sentiment analysis, Text Classification}, pubstate = {published}, tppubtype = {conference} } Social tagging of movies reveals a wide range of heterogeneous information about movies, like the genre, plot structure, soundtracks, metadata, visual and emotional experiences. Such information can be valuable in building automatic systems to create tags for movies. Automatic tagging systems can help recommendation engines to improve the retrieval of similar movies as well as help viewers to know what to expect from a movie in advance. In this paper, we set out to the task of collecting a corpus of movie plot synopses and tags. We describe a methodology that enabled us to build a fine-grained set of around 70 tags exposing heterogeneous characteristics of movie plots and the multi-label associations of these tags with some 14K movie plot synopses. We investigate how these tags correlate with movies and the flow of emotions throughout different types of movies. Finally, we use this corpus to explore the feasibility of inferring tags from plot synopses. We expect the corpus will be useful in other tasks where analysis of narratives is relevant. |
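The tag-inference experiments mentioned above amount to multi-label text classification: a synopsis goes in, a set of tags comes out. The toy snippet below illustrates that framing with placeholder synopses, a placeholder tag set, and an off-the-shelf classifier; it is not the MPST data or the models evaluated in the paper.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import make_pipeline

synopses = [
    "A detective hunts a serial killer through a rain-soaked city.",
    "Two old friends reunite for a road trip and rediscover their bond.",
    "A haunted house slowly drives a family to madness.",
    "A con artist plans one last heist before retiring.",
]
tags = [["murder", "suspenseful"], ["feel-good"], ["paranormal", "suspenseful"], ["suspenseful"]]

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(tags)                                   # multi-label indicator targets
clf = make_pipeline(TfidfVectorizer(),
                    OneVsRestClassifier(LogisticRegression(max_iter=1000)))
clf.fit(synopses, Y)
pred = clf.predict(["A ghost terrorizes a small town sheriff."])
print(mlb.inverse_transform(pred))                            # predicted tag set(s)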
Shirani, Amirreza; Lopez-Monroy, Pastor; Gonzalez, Fabio; Solorio, Thamar; Alipour, Mohammad Amin Evaluation of Type Inference with Textual Cues Conference Thirty-Second AAAI Conference on Artificial Intelligence (AAAI-18), 2018. Abstract | Links | BibTeX | Tags: Java Language, SVM, Textual Features, Type Prediction @conference{Shirani2018, title = {Evaluation of Type Inference with Textual Cues}, author = {Amirreza Shirani and Pastor Lopez-Monroy and Fabio Gonzalez and Thamar Solorio and Mohammad Amin Alipour}, url = {https://www.researchgate.net/publication/323627639_Evaluation_of_Type_Inference_with_Textual_Cues}, year = {2018}, date = {2018-02-07}, booktitle = {Thirty-Second AAAI Conference on Artificial Intelligence (AAAI-18)}, abstract = {Type information plays an important role in the success of information retrieval and recommendation systems in software engineering. Thus, the absence of types in dynamically-typed languages poses a challenge to adapting these systems to support dynamic languages. In this paper, we explore the viability of type inference using textual cues. That is, we formulate the type inference problem as a classification problem which uses the textual features in the source code to predict the type of variables. In this approach, a classifier learns a model to distinguish between types of variables in a program. The model is subsequently used to (approximately) infer the types of other variables. We evaluate the feasibility of this approach on four Java projects wherein type information is already available in the source code and can be used to train and test a classifier. Our experiments show this approach can predict the type of new variables with relatively high accuracy (80% F-measure). These results suggest that textual cues can be complementary tools in inferring types for dynamic languages.}, keywords = {Java Language, SVM, Textual Features, Type Prediction}, pubstate = {published}, tppubtype = {conference} } Type information plays an important role in the success of information retrieval and recommendation systems in software engineering. Thus, the absence of types in dynamically-typed languages poses a challenge to adapting these systems to support dynamic languages. In this paper, we explore the viability of type inference using textual cues. That is, we formulate the type inference problem as a classification problem which uses the textual features in the source code to predict the type of variables. In this approach, a classifier learns a model to distinguish between types of variables in a program. The model is subsequently used to (approximately) infer the types of other variables. We evaluate the feasibility of this approach on four Java projects wherein type information is already available in the source code and can be used to train and test a classifier. Our experiments show this approach can predict the type of new variables with relatively high accuracy (80% F-measure). These results suggest that textual cues can be complementary tools in inferring types for dynamic languages. |
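The classification framing described in this abstract can be illustrated with a small sketch: textual cues around a variable (here just its name and a few nearby tokens) are vectorized and fed to a linear classifier that predicts the declared type. The example snippets, the feature choice, and the use of a linear SVM are simplifying assumptions, not the exact setup evaluated in the paper.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

# (variable name + nearby tokens, declared type) pairs, as they might be mined from Java code
samples = [
    ("count i loop for int increment", "int"),
    ("total sum price amount double", "double"),
    ("name label message text append", "String"),
    ("index size length array int", "int"),
    ("title header text concat toUpperCase", "String"),
    ("rate average mean value double", "double"),
]
texts, types = zip(*samples)

clf = make_pipeline(CountVectorizer(), LinearSVC())
clf.fit(texts, types)
print(clf.predict(["userName greeting text toLowerCase"]))    # likely ['String'] given the shared 'text' cue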
Osborne, John D; Neu, Matthew B; Danila, Maria I; Solorio, Thamar; Bethard, Steven J CUILESS2016: a clinical corpus applying compositional normalization of text mentions Journal Article Journal of Biomedical Semantics, 9 (2), 2018. Links | BibTeX | Tags: Semantic Analysis @article{OsborneEtAl:18, title = {CUILESS2016: a clinical corpus applying compositional normalization of text mentions}, author = {John D Osborne and Matthew B Neu and Maria I Danila and Thamar Solorio and Steven J Bethard}, url = {http://rdcu.be/EpJq}, year = {2018}, date = {2018-01-10}, journal = {Journal of Biomedical Semantics}, volume = {9}, number = {2}, keywords = {Semantic Analysis}, pubstate = {published}, tppubtype = {article} } |
2017 |
Aguilar, Gustavo; Maharjan, Suraj; López Monroy, A. Pastor; Solorio, Thamar A Multi-task Approach for Named Entity Recognition on Social Media Data Inproceedings Proceedings of the 3rd Workshop on Noisy User-generated Text (WNUT 2017), 2017, (Ranked 1st place in the two evaluation metrics). Abstract | Links | BibTeX | Tags: CRF, Deeplearning, Multitask, NER @inproceedings{aguilar-EtAl:2017:WNUT, title = {A Multi-task Approach for Named Entity Recognition on Social Media Data}, author = {Gustavo Aguilar and Suraj Maharjan and A. Pastor López Monroy and Thamar Solorio}, url = {http://www.aclweb.org/anthology/W17-4419}, year = {2017}, date = {2017-09-07}, publisher = {Proceedings of the 3rd Workshop on Noisy User-generated Text (WNUT 2017)}, abstract = {Named Entity Recognition for social media data is challenging because of its inherent noisiness. In addition to improper grammatical structures, it contains spelling inconsistencies and numerous informal abbreviations. We propose a novel multi-task approach by employing a more general secondary task of Named Entity (NE) segmentation together with the primary task of fine-grained NE categorization. The multi-task neural network architecture learns higher order feature representations from word and character sequences along with basic Part-of-Speech tags and gazetteer information. This neural network acts as a feature extractor to feed a Conditional Random Fields classifier. We were able to obtain the first position in the 3rd Workshop on Noisy User-generated Text (WNUT-2017) with a 41.86% entity F1-score and a 40.24% surface F1-score.}, note = {Ranked 1st place in the two evaluation metrics}, keywords = {CRF, Deeplearning, Multitask, NER}, pubstate = {published}, tppubtype = {inproceedings} } Named Entity Recognition for social media data is challenging because of its inherent noisiness. In addition to improper grammatical structures, it contains spelling inconsistencies and numerous informal abbreviations. We propose a novel multi-task approach by employing a more general secondary task of Named Entity (NE) segmentation together with the primary task of fine-grained NE categorization. The multi-task neural network architecture learns higher order feature representations from word and character sequences along with basic Part-of-Speech tags and gazetteer information. This neural network acts as a feature extractor to feed a Conditional Random Fields classifier. We were able to obtain the first position in the 3rd Workshop on Noisy User-generated Text (WNUT-2017) with a 41.86% entity F1-score and a 40.24% surface F1-score. |
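A tiny sketch of how the secondary task in this multitask setup can be derived from the primary one: the fine-grained BIO tags (primary task, NE categorization) are collapsed into coarse segmentation tags (secondary task), and the network learns both tag sequences from shared word and character representations. The example tweet and tag names below are made up for illustration.

def to_segmentation_labels(fine_grained_tags):
    # Collapse fine-grained BIO tags (e.g. B-person) into coarse segmentation tags (B/I/O).
    return [tag.split("-")[0] for tag in fine_grained_tags]

tokens = ["so", "happy", "2", "see", "ariana", "grande", "tonight", "!!!"]
fine   = ["O", "O", "O", "O", "B-person", "I-person", "O", "O"]
coarse = to_segmentation_labels(fine)
print(list(zip(tokens, fine, coarse)))
# both label sequences are predicted jointly, sharing the same encoder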
Samghabadi, Niloofar S; Maharjan, Suraj; Sprague, Alan; Sprague, Raquel D; Solorio, Thamar Detecting Nastiness in Social Media Inproceedings ALW1@ACL2017, 2017. Links | BibTeX | Tags: Abusive Language detection @inproceedings{safisamghabadi-EtAl:2017:ALW1, title = {Detecting Nastiness in Social Media}, author = {Niloofar S. Samghabadi and Suraj Maharjan and Alan Sprague and Raquel D. Sprague and Thamar Solorio}, url = {http://aclweb.org/anthology/W17-3010}, year = {2017}, date = {2017-08-04}, booktitle = {ALW1@ACL2017}, keywords = {Abusive Language detection}, pubstate = {published}, tppubtype = {inproceedings} } |
Kar, Sudipta; Maharjan, Suraj; Solorio, Thamar RiTUAL-UH at SemEval-2017 Task 5: Sentiment Analysis on Financial Data Using Neural Networks Inproceedings Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017), 2017, (Ranked 2nd for Subtask 2. With alternate scoring, ranked 1st in both subtasks.). Abstract | Links | BibTeX | Tags: CNN, Neural Networks, Sentiment analysis @inproceedings{Kar2017, title = {RiTUAL-UH at SemEval-2017 Task 5: Sentiment Analysis on Financial Data Using Neural Networks}, author = {Sudipta Kar and Suraj Maharjan and Thamar Solorio}, url = {http://www.aclweb.org/anthology/S17-2150}, year = {2017}, date = {2017-08-03}, publisher = {Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)}, abstract = {In this paper, we present our systems for the “SemEval-2017 Task 5 on Fine-Grained Sentiment Analysis on Financial Microblogs and News”. In our system, we combined hand-engineered lexical, sentiment, and metadata features with representations learned by a Convolutional Neural Network (CNN) and a Bidirectional Gated Recurrent Unit (Bi-GRU) with an attention model applied on top. With this architecture, we obtained weighted cosine similarity scores of 72.34% and 74.37% for subtask-1 and subtask-2, respectively. Using the official scoring system, our system ranked second for subtask-2 and eighth for subtask-1. Under an alternate scoring system, it ranked first in both subtasks.}, note = {Ranked 2nd for Subtask 2. With alternate scoring, ranked 1st in both subtasks.}, keywords = {CNN, Neural Networks, Sentiment analysis}, pubstate = {published}, tppubtype = {inproceedings} } In this paper, we present our systems for the “SemEval-2017 Task 5 on Fine-Grained Sentiment Analysis on Financial Microblogs and News”. In our system, we combined hand-engineered lexical, sentiment, and metadata features with representations learned by a Convolutional Neural Network (CNN) and a Bidirectional Gated Recurrent Unit (Bi-GRU) with an attention model applied on top. With this architecture, we obtained weighted cosine similarity scores of 72.34% and 74.37% for subtask-1 and subtask-2, respectively. Using the official scoring system, our system ranked second for subtask-2 and eighth for subtask-1. Under an alternate scoring system, it ranked first in both subtasks. |
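A rough sketch of the kind of architecture this abstract describes: word embeddings feed a convolutional branch and a bidirectional GRU branch with a simple attention layer, and the combined representation is regressed onto a continuous sentiment score. The hyperparameters, the attention formulation, and the way the branches are combined are guesses for illustration, not the submitted system (which also used hand-engineered features).

import torch
import torch.nn as nn

class CnnBiGruSentiment(nn.Module):
    def __init__(self, vocab=8000, emb=50, filters=32, hidden=32):
        super().__init__()
        self.emb = nn.Embedding(vocab, emb)
        self.conv = nn.Conv1d(emb, filters, kernel_size=3, padding=1)
        self.gru = nn.GRU(emb, hidden, batch_first=True, bidirectional=True)
        self.att = nn.Linear(2 * hidden, 1)             # simple attention scoring layer
        self.out = nn.Linear(filters + 2 * hidden, 1)
    def forward(self, ids):                             # ids: (batch, seq_len)
        e = self.emb(ids)                               # (batch, seq, emb)
        c = torch.relu(self.conv(e.transpose(1, 2)))    # (batch, filters, seq)
        c = c.max(dim=2).values                         # max-pooled CNN features
        h, _ = self.gru(e)                              # (batch, seq, 2*hidden)
        w = torch.softmax(self.att(h), dim=1)           # attention weights over time steps
        g = (w * h).sum(dim=1)                          # attended Bi-GRU summary
        return torch.tanh(self.out(torch.cat([c, g], dim=1)))  # score in [-1, 1]

scores = CnnBiGruSentiment()(torch.randint(0, 8000, (4, 20)))
print(scores.shape)                                     # torch.Size([4, 1])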
Maharjan, Suraj; Arevalo, John; Montes, Manuel; Gonzalez, Fabio A; Solorio, Thamar A Multi-task Approach to Predict Likability of Books Inproceedings Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers, pp. 1217–1227, Association for Computational Linguistics, Valencia, Spain, 2017. Links | BibTeX | Tags: Multitask, Neural Networks @inproceedings{Maharjan2017, title = {A Multi-task Approach to Predict Likability of Books}, author = {Suraj Maharjan and John Arevalo and Manuel Montes and Fabio A. Gonzalez and Thamar Solorio}, url = {https://www.aclweb.org/anthology/E/E17/E17-1114.pdf}, year = {2017}, date = {2017-04-03}, booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers}, pages = {1217--1227}, publisher = {Association for Computational Linguistics}, address = {Valencia, Spain}, keywords = {Multitask, Neural Networks}, pubstate = {published}, tppubtype = {inproceedings} } |
Shrestha, Prasha; Sierra, Sebastian; Gonzalez, Fabio; Montes, Manuel; Rosso, Paolo; Solorio, Thamar Convolutional Neural Networks for Authorship Attribution of Short Texts Inproceedings Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers, pp. 669–674, Association for Computational Linguistics, Valencia, Spain, 2017. Links | BibTeX | Tags: Authorship Attribution, CNN @inproceedings{Shrestha2017, title = {Convolutional Neural Networks for Authorship Attribution of Short Texts}, author = { Prasha Shrestha and Sebastian Sierra and Fabio Gonzalez and Manuel Montes and Paolo Rosso and Thamar Solorio}, url = {https://www.aclweb.org/anthology/E/E17/E17-2106.pdf}, year = {2017}, date = {2017-04-03}, booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers}, pages = {669--674}, publisher = {Association for Computational Linguistics}, address = {Valencia, Spain}, keywords = {Authorship Attribution, CNN}, pubstate = {published}, tppubtype = {inproceedings} } |
Arevalo, John; Solorio, Thamar; Montes-y-Gómez, Manuel; González, Fabio A Gated Multimodal Units for Information Fusion Conference 5th International Conference on Learning Representations (ICLR) 2017 - Workshop Track, 2017. Links | BibTeX | Tags: Multimodal @conference{ArevaloEtAl:17, title = {Gated Multimodal Units for Information Fusion}, author = {John Arevalo and Thamar Solorio and Manuel Montes-y-Gómez and Fabio A González}, url = {https://arxiv.org/pdf/1702.01992.pdf}, year = {2017}, date = {2017-02-07}, booktitle = {5th International Conference on Learning Representations (ICLR) 2017 - Workshop Track}, keywords = {Multimodal}, pubstate = {published}, tppubtype = {conference} } |
2016 |
Attia, Mohammed; Maharjan, Suraj; Samih, Younes; Kallmeyer, Laura; Solorio, Thamar CogALex-V Shared Task: GHHH-Detecting Semantic Relations via Word Embeddings Proceeding COLING 2016, 2016. Links | BibTeX | Tags: Multitask CNN, shared task @proceedings{Attia2016, title = {CogALex-V Shared Task: GHHH-Detecting Semantic Relations via Word Embeddings}, author = { Mohammed Attia and Suraj Maharjan and Younes Samih and Laura Kallmeyer and Thamar Solorio}, url = {https://0196403f-a-62cb3a1a-s-sites.googlegroups.com/site/cogalex2016/home/accepted-papers/CogALex-V_Proceedings.pdf?attachauth=ANoY7cr0r4e68M8K-_cfjhjFKT3AWjmTqlHE03yzJDAGAPM7UGnGo0fdL-hJDw7OibRB_ez0E_mV52qLK625Y6oxQYQHy9fR_huydrJawgirWO8t-4xYv17CY26yzHq5s99haOCe5HJ5dcVFWaxAmVngOpEH345qsvxTfwAJf4IO3JJte0huRQ29NCosOhzjMGRmFexrTlzQMMl1gSVZQSpCQMxIwUbpsobLJ8NXJc_cX0THghOGZevzcxgg2WuV__z09CI2397T&attredirects=0#page=100}, year = {2016}, date = {2016-12-12}, publisher = {COLING 2016}, keywords = {Multitask CNN, shared task}, pubstate = {published}, tppubtype = {proceedings} } |
Rey-Villamizar, Nicolas; Shrestha, Prasha; Sadeque, Farig; Bethard, Steven; Pedersen, Ted; Mukherjee, Arjun; Solorio, Thamar Analysis of Anxious Word Usage on Online Health Forums Proceeding EMNLP 2016, 2016. Links | BibTeX | Tags: Clinical Applications, health forum, LIWC @proceedings{Rey-Villamizar2016b, title = {Analysis of Anxious Word Usage on Online Health Forums}, author = {Nicolas Rey-Villamizar and Prasha Shrestha and Farig Sadeque and Steven Bethard and Ted Pedersen and Arjun Mukherjee and Thamar Solorio}, url = {http://www.aclweb.org/anthology/W/W16/W16-6105.pdf}, year = {2016}, date = {2016-11-05}, publisher = {EMNLP 2016}, keywords = {Clinical Applications, health forum, LIWC}, pubstate = {published}, tppubtype = {proceedings} } |
Molina, Giovanni; Rey-Villamizar, Nicolas; Solorio, Thamar; AlGhamdi, Fahad; Ghoneim, Mahmoud; Hawwari, Abdelati; Diab, Mona Overview for the second shared task on language identification in code-switched data Proceeding Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP 2016, 2016. Links | BibTeX | Tags: Code-Switching, shared task @proceedings{Molina2016, title = {Overview for the second shared task on language identification in code-switched data}, author = {Giovanni Molina and Nicolas Rey-Villamizar and Thamar Solorio and Fahad AlGhamdi and Mahmoud Ghoneim and Abdelati Hawwari and Mona Diab}, url = {http://www.aclweb.org/anthology/W/W16/W16-5805.pdf}, year = {2016}, date = {2016-11-01}, pages = {40-49}, publisher = {Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP 2016}, keywords = {Code-Switching, shared task}, pubstate = {published}, tppubtype = {proceedings} } |
Sadeque, Farig; Pedersen, Ted; Solorio, Thamar; Shrestha, Prasha; Rey-Villamizar, Nicolas; Bethard, Steven Why Do They Leave: Modeling Participation in Online Depression Forums Proceeding Conference on Empirical Methods in Natural Language Processing, 2016. Links | BibTeX | Tags: health forum @proceedings{Sadeque2016, title = {Why Do They Leave: Modeling Participation in Online Depression Forums}, author = {Farig Sadeque and Ted Pedersen and Thamar Solorio and Prasha Shrestha and Nicolas Rey-Villamizar and Steven Bethard}, url = {http://www.aclweb.org/anthology/W/W16/W16-6203.pdf}, year = {2016}, date = {2016-11-01}, publisher = {Conference on Empirical Methods in Natural Language Processing}, keywords = {health forum}, pubstate = {published}, tppubtype = {proceedings} } |
AlGhamdi, Fahad; Molina, Giovanni; Diab, Mona; Solorio, Thamar; Hawwari, Abdelati; Soto, Victor; Hirschberg, Julia Part of Speech Tagging for Code Switched Data Proceeding Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP, 2016. Links | BibTeX | Tags: Code-Switching @proceedings{AlGhamdi2016, title = {Part of Speech Tagging for Code Switched Data}, author = {Fahad AlGhamdi and Giovanni Molina and Mona Diab and Thamar Solorio and Abdelati Hawwari and Victor Soto and Julia Hirschberg}, url = {http://www.aclweb.org/anthology/W/W16/W16-5812.pdf}, year = {2016}, date = {2016-11-01}, pages = {98-107}, publisher = {Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP}, keywords = {Code-Switching}, pubstate = {published}, tppubtype = {proceedings} } |
Samih, Younes; Maharjan, Suraj; Attia, Mohammed; Kallmeyer, Laura; Solorio, Thamar Multilingual Code-switching Identification via LSTM Recurrent Neural Networks Proceeding Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP, 2016. Links | BibTeX | Tags: Code-Switching, CRF, Deeplearning, Neural Networks @proceedings{Samih2016, title = {Multilingual Code-switching Identification via LSTM Recurrent Neural Networks}, author = {Younes Samih and Suraj Maharjan and Mohammed Attia and Laura Kallmeyer and Thamar Solorio}, url = {http://www.aclweb.org/anthology/W/W16/W16-5806.pdf}, year = {2016}, date = {2016-10-31}, publisher = {Proceedings of the Second Workshop on Computational Approaches to Code Switching; EMNLP}, keywords = {Code-Switching, CRF, Deeplearning, Neural Networks}, pubstate = {published}, tppubtype = {proceedings} } |
Sapkota, Upendra; Solorio, Thamar; y Gomez, Manuel Montes; Bethard, Steven Domain Adaptation for Authorship Attribution: Improved Structural Correspondence Learning Conference Association for Computational Linguistics (ACL) , ACL Berlin, Germany, 2016. Links | BibTeX | Tags: Authorship Attribution @conference{SapkotaEtAl:16, title = {Domain Adaptation for Authorship Attribution: Improved Structural Correspondence Learning}, author = {Upendra Sapkota and Thamar Solorio and Manuel Montes y Gomez and Steven Bethard}, url = {https://aclweb.org/anthology/P/P16/P16-1210.pdf}, year = {2016}, date = {2016-08-08}, booktitle = {Association for Computational Linguistics (ACL) }, address = {Berlin, Germany}, organization = {ACL}, keywords = {Authorship Attribution}, pubstate = {published}, tppubtype = {conference} } |
Franco-Salvador, Marc; Kar, Sudipta; Solorio, Thamar; Rosso, Paolo UH-PRHLT at SemEval-2016 Task 3: Combining Lexical and Semantic-based Features for Community Question Answering Inproceedings Proceedings of SemEval-2016, pp. 814-821, Association for Computational Linguistics, San Diego, California, 2016. Links | BibTeX | Tags: Question Answering, Text Classification @inproceedings{Franco-Salvador2016, title = {UH-PRHLT at SemEval-2016 Task 3: Combining Lexical and Semantic-based Features for Community Question Answering}, author = {Marc Franco-Salvador and Sudipta Kar and Thamar Solorio and Paolo Rosso}, url = {https://aclweb.org/anthology/S/S16/S16-1126.pdf}, year = {2016}, date = {2016-06-16}, booktitle = {Proceedings of SemEval-2016}, pages = {814-821}, publisher = {Association for Computational Linguistics}, address = {San Diego, California}, keywords = {Question Answering, Text Classification}, pubstate = {published}, tppubtype = {inproceedings} } |
Rey-Villamizar, Nicolas; Shrestha, Prasha; Solorio, Thamar; Sadeque, Farig; Bethard, Steven; Pedersen, Ted A Semi-supervised Approach for the CLPsych 2016 Shared Task Conference 3rd Workshop on Computational Linguistics and Clinical Psychology: From Linguistic Signal to Clinical Reality, Association for Computational Linguistics, 2016. Links | BibTeX | Tags: bayesian optimization, health forum, LIWC, shared task @conference{Rey-Villamizar2016, title = {A Semi-supervised Approach for the CLPsych 2016 Shared Task}, author = {Nicolas Rey-Villamizar and Prasha Shrestha and Thamar Solorio and Farig Sadeque and Steven Bethard and Ted Pedersen}, url = {http://anthology.aclweb.org/W/W16/W16-0322.pdf}, year = {2016}, date = {2016-06-16}, booktitle = {3rd Workshop on Computational Linguistics and Clinical Psychology: From Linguistic Signal to Clinical Reality}, pages = {171–175}, publisher = {Association for Computational Linguistics}, keywords = {bayesian optimization, health forum, LIWC, shared task}, pubstate = {published}, tppubtype = {conference} } |
Shrestha, Prasha; Bethard, Steven; Pedersen, Ted; Rey-Villamizar, Nicolas; Sadeque, Farig; Solorio, Thamar Age and Gender Prediction on Health Forum Data Conference Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), European Language Resources Association (ELRA), 2016. Links | BibTeX | Tags: health applications, Profiling @conference{ShrestaEtAl:16, title = {Age and Gender Prediction on Health Forum Data}, author = {Prasha Shrestha and Steven Bethard and Ted Pedersen and Nicolas Rey-Villamizar and Farig Sadeque and Thamar Solorio }, url = {http://www.lrec-conf.org/proceedings/lrec2016/pdf/1117_Paper.pdf}, year = {2016}, date = {2016-05-24}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, journal = {Proceedings of LREC}, pages = {8}, publisher = {European Language Resources Association (ELRA)}, keywords = {health applications, Profiling}, pubstate = {published}, tppubtype = {conference} } |
2015 |
Sadeque, Farig; Solorio, Thamar; Pedersen, Ted; Shrestha, Prasha; Bethard, Steven Predicting Continued Participation in Online Health Forums Inproceedings Proceedings of the 6th International Workshop on Health Text Mining and Information Analysis (Louhi), pp. 15-20, Association for Computational Linguistics, Lisboa, Portugal, 2015. Links | BibTeX | Tags: health applications @inproceedings{Sadeque-EtAl:2015:LOUHI, title = {Predicting Continued Participation in Online Health Forums}, author = {Farig Sadeque and Thamar Solorio and Ted Pedersen and Prasha Shrestha and Steven Bethard}, url = {https://aclweb.org/anthology/W/W15/W15-2602.pdf}, year = {2015}, date = {2015-09-23}, booktitle = {Proceedings of the 6th International Workshop on Health Text Mining and Information Analysis (Louhi)}, pages = {15-20}, publisher = {Association for Computational Linguistics}, address = {Lisboa, Portugal}, keywords = {health applications}, pubstate = {published}, tppubtype = {inproceedings} } |
Maharjan, Suraj; Solorio, Thamar Using Wide Range of Features for Author profiling Proceeding CLEF, 2015. Links | BibTeX | Tags: Profiling, shared task @proceedings{Maharjan2015, title = {Using Wide Range of Features for Author profiling}, author = {Maharjan, Suraj and Solorio, Thamar}, url = {https://pdfs.semanticscholar.org/f555/c009a623960b60b52f38a39a18fa5cbac273.pdf}, year = {2015}, date = {2015-09-08}, journal = {Proceedings of CLEF}, publisher = {CLEF}, keywords = {Profiling, shared task}, pubstate = {published}, tppubtype = {proceedings} } |
Maharjan, Suraj; Blair, Elizabeth; Bethard, Steven; Solorio, Thamar Developing Language-tagged Corpora for Code-switching Tweets Inproceedings Proceedings of The 9th Linguistic Annotation Workshop, pp. 72–84, Association for Computational Linguistics, Denver, Colorado, USA, 2015. Links | BibTeX | Tags: Code-Switching @inproceedings{maharjan-EtAl:2015:LAW, title = {Developing Language-tagged Corpora for Code-switching Tweets}, author = {Suraj Maharjan and Elizabeth Blair and Steven Bethard and Thamar Solorio}, url = {http://www.aclweb.org/anthology/W15-1608}, year = {2015}, date = {2015-06-05}, booktitle = {Proceedings of The 9th Linguistic Annotation Workshop}, pages = {72--84}, publisher = {Association for Computational Linguistics}, address = {Denver, Colorado, USA}, keywords = {Code-Switching}, pubstate = {published}, tppubtype = {inproceedings} } |
Sapkota, Upendra; Bethard, Steven; y Gomez, Manuel Montes; Solorio, Thamar Not All Character N-grams Are Created Equal: A Study in Authorship Attribution Conference Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, ACL Denver, Colorado, 2015. Links | BibTeX | Tags: Authorship Attribution @conference{SapkotaEtAl:15, title = {Not All Character N-grams Are Created Equal: A Study in Authorship Attribution}, author = {Upendra Sapkota and Steven Bethard and Manuel Montes y Gomez and Thamar Solorio}, url = {http://www.aclweb.org/anthology/N/N15/N15-1010.pdf}, year = {2015}, date = {2015-06-01}, booktitle = {Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, pages = {93--102}, address = {Denver, Colorado}, organization = {ACL}, keywords = {Authorship Attribution}, pubstate = {published}, tppubtype = {conference} } |
Shrestha, Prasha; Solorio, Thamar Identification of Original Document by Using Textual Similarities Incollection Gelbukh, Alexander (Ed.): Computational Linguistics and Intelligent Text Processing, 9042 , pp. 643-654, Springer International Publishing, 2015, ISBN: 978-3-319-18116-5. Links | BibTeX | Tags: plagiarism @incollection{, title = {Identification of Original Document by Using Textual Similarities}, author = { Prasha Shrestha and Thamar Solorio}, editor = {Gelbukh, Alexander}, url = {http://dx.doi.org/10.1007/978-3-319-18117-2_48}, doi = {10.1007/978-3-319-18117-2_48}, isbn = {978-3-319-18116-5}, year = {2015}, date = {2015-01-01}, booktitle = {Computational Linguistics and Intelligent Text Processing}, volume = {9042}, pages = {643-654}, publisher = {Springer International Publishing}, series = {Lecture Notes in Computer Science}, keywords = {plagiarism}, pubstate = {published}, tppubtype = {incollection} } |
2014 |
Solorio, Thamar; Blair, Elizabeth; Maharjan, Suraj; Bethard, Steven; Diab, Mona; Ghoneim, Mahmoud; Hawwari, Abdelati; AlGhamdi, Fahad; Hirschberg, Julia; Chang, Alison; Fung, Pascale Overview for the First Shared Task on Language Identification in Code-Switched Data Conference Proceedings of The First Workshop on Computational Approaches to Code Switching, held in conjunction with EMNLP 2014, ACL, Doha, Qatar, 2014. Links | BibTeX | Tags: Code-Switching @conference{SolorioEtAl:14, title = {Overview for the First Shared Task on Language Identification in Code-Switched Data}, author = {Thamar Solorio and Elizabeth Blair and Suraj Maharjan and Steven Bethard and Mona Diab and Mahmoud Ghoneim and Abdelati Hawwari and Fahad AlGhamdi and Julia Hirschberg and Alison Chang and Pascale Fung}, url = {http://www.aclweb.org/anthology/W/W14/W14-3907.pdf}, year = {2014}, date = {2014-10-25}, booktitle = {Proceedings of The First Workshop on Computational Approaches to Code Switching, held in conjunction with EMNLP 2014}, pages = {62--72}, publisher = {ACL}, address = {Doha, Qatar}, keywords = {Code-Switching}, pubstate = {published}, tppubtype = {conference} } |
Maharjan, Suraj; Shrestha, Prasha; Solorio, Thamar A Simple Approach to Author Profiling in MapReduce Conference CLEF, 2014. Links | BibTeX | Tags: Profiling, shared task @conference{Maharjan2014, title = {A Simple Approach to Author Profiling in MapReduce}, author = {Suraj Maharjan and Prasha Shrestha and Thamar Solorio}, url = {http://ceur-ws.org/Vol-1180/CLEF2014wn-Pan-MaharjanEt2014.pdf}, year = {2014}, date = {2014-09-15}, publisher = {CLEF}, keywords = {Profiling, shared task}, pubstate = {published}, tppubtype = {conference} } |