2022
Pardo-Sixtos, L Fernando; López-Monroy, A Pastor; Shafaei, Mahsa; Solorio, Thamar
Hierarchical attention and transformers for automatic movie rating Journal Article
In: Expert Systems with Applications, pp. 118164, 2022.
Abstract | Links | BibTeX | Tags:
@article{nokey,
title = {Hierarchical attention and transformers for automatic movie rating},
author = {L Fernando Pardo-Sixtos and A Pastor López-Monroy and Mahsa Shafaei and Thamar Solorio},
url = {https://www.sciencedirect.com/science/article/abs/pii/S0957417422013240},
year = {2022},
date = {2022-09-01},
urldate = {2022-09-01},
journal = {Expert Systems with Applications},
pages = {118164},
abstract = {The MPAA rating provides a guide for parents to decide if a movie is suitable for their children, and determines who is allowed into movie screenings. If the assigned rating does not match with that intended by the movie makers, the movie has to go through extra changes. Predicting this rating from the movie scripts would allow for the changes to be done even before the shooting starts, when they are the cheapest. Furthermore, automatizing this reviewing process would allow for cheaper large scale classification of videos from other sources, such as social media and streaming platforms. In this paper we propose RNN and Transformer based hierarchical architecture well suited to analyze movie scripts as large text sequences. The proposed RNN architecture outperforms the State-of-the-art (SOTA) by around 3 points in the F1 score, while our Hierarchical Transformer outperformed the SOTA in around 5 points …},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Chebolu, Siva Uday Sampreeth; Dernoncourt, Franck; Lipka, Nedim; Solorio, Thamar
Survey of Aspect-based Sentiment Analysis Datasets Journal Article
In: arXiv e-prints, pp. arXiv: 2204.05232, 2022.
Abstract | Links | BibTeX | Tags: Sentiment analysis
@article{Chebolu2022absa,
title = {Survey of Aspect-based Sentiment Analysis Datasets},
author = {Siva Uday Sampreeth Chebolu and Franck Dernoncourt and Nedim Lipka and Thamar Solorio},
url = {https://ui.adsabs.harvard.edu/abs/2022arXiv220405232U/abstract},
year = {2022},
date = {2022-04-01},
journal = {arXiv e-prints},
pages = {arXiv: 2204.05232},
eprint = {2204.05232},
eprinttype = {arXiv},
abstract = {Aspect-based sentiment analysis (ABSA) is a natural language processing problem that requires analyzing user-generated reviews in order to determine: a) The target entity being reviewed, b) The high-level aspect to which it belongs, and c) The sentiment expressed toward the targets and the aspects. Numerous yet scattered corpora for ABSA make it difficult for researchers to quickly identify corpora best suited for a specific ABSA subtask. This study aims to present a database of corpora that can be used to train and assess autonomous ABSA systems. Additionally, we provide an overview of the major corpora concerning the various ABSA and its subtasks and highlight several corpus features that researchers should consider when selecting a corpus. We conclude that further large-scale ABSA corpora are required. Additionally, because each corpus is constructed differently, it is time-consuming for researchers to },
keywords = {Sentiment analysis},
pubstate = {published},
tppubtype = {article}
}
Chen, Shuguang; Aguilar, Gustavo; Srinivasan, Anirudh; Diab, Mona; Solorio, Thamar
CALCS 2021 Shared Task: Machine Translation for Code-Switched Data Journal Article
In: arXiv preprint arXiv:2202.09625, 2022.
Abstract | Links | BibTeX | Tags: Code-Switching
@article{Chen2022calcs,
title = {CALCS 2021 Shared Task: Machine Translation for Code-Switched Data},
author = {Shuguang Chen and Gustavo Aguilar and Anirudh Srinivasan and Mona Diab and Thamar Solorio},
url = {https://arxiv.org/pdf/2202.09625.pdf},
year = {2022},
date = {2022-02-19},
urldate = {2022-02-19},
journal = {arXiv preprint arXiv:2202.09625},
eprint = {2202.09625},
eprinttype = {arXiv},
abstract = {To date, efforts in the code-switching literature have focused for the most part on language identification, POS, NER, and syntactic parsing. In this paper, we address machine translation for code-switched social media data. We create a community shared task. We provide two modalities for participation: supervised and unsupervised. For the supervised setting, participants are challenged to translate English into Hindi-English (Eng-Hinglish) in a single direction. For the unsupervised setting, we provide the following language pairs: English and Spanish-English (Eng-Spanglish), and English and Modern Standard Arabic-Egyptian Arabic (Eng-MSAEA) in both directions. We share insights and challenges in curating the "into" code-switching language evaluation data. Further, we provide baselines for all language pairs in the shared task. The leaderboard for the shared task comprises 12 individual system submissions corresponding to 5 different teams. The best performance achieved is 12.67\% BLEU score for English to Hinglish and 25.72\% BLEU score for MSAEA to English.},
keywords = {Code-Switching},
pubstate = {published},
tppubtype = {article}
}
Chebolu, Siva Uday Sampreeth; Rosso, Paolo; Kar, Sudipta; Solorio, Thamar
Survey on Aspect Category Detection Conference
ACM Computing Surveys (CSUR), ACM, 2022.
Abstract | Links | BibTeX | Tags: explanatory analysis
@conference{CSUR,
title = {Survey on Aspect Category Detection},
author = {Siva Uday Sampreeth Chebolu and Paolo Rosso and Sudipta Kar and Thamar Solorio},
url = {https://dl.acm.org/doi/abs/10.1145/3544557},
year = {2022},
date = {2022-01-01},
booktitle = {ACM Computing Surveys (CSUR)},
publisher = {ACM},
abstract = {In recent years, aspect category detection has become popular due to the rapid growth in customer reviews data on e-commerce and other online platforms. Aspect Category Detection, a sub-task of Aspect-Based Sentiment Analysis, categorizes the reviews based on the features of a product such as a laptop’s display, or an aspect of an entity such as the restaurant’s ambiance. Various methods have been proposed to deal with such a problem. In this paper, we first introduce several datasets in the community that deal with this task and take a closer look at them by providing some exploratory analysis. Then, we review a number of representative methods for aspect category detection and classify them into two main groups: 1) supervised learning, and 2) unsupervised learning. Next, we discuss the strengths and weaknesses of different kinds of methods, which are expected to benefit both practical applications and …},
keywords = {explanatory analysis},
pubstate = {published},
tppubtype = {conference}
}
2021
Chowdhury, Farah Naz; Koka, Raga Shalini; Rahman, Mohammad Rajiur; Solorio, Thamar; Subhlok, Jaspal
Identifying Keyword Predictors in Lecture Video Screen Text Conference
2021 IEEE International Symposium on Multimedia (ISM), IEEE, 2021.
Abstract | Links | BibTeX | Tags:
@conference{Chowdhury2021keyword,
title = {Identifying Keyword Predictors in Lecture Video Screen Text},
author = {Farah Naz Chowdhury and Raga Shalini Koka and Mohammad Rajiur Rahman and Thamar Solorio and Jaspal Subhlok},
url = {http://www.videopoints.org/public/paper/Identifying_Keyword_Predictors_in_Lecture_Video_Screen_Text.pdf},
year = {2021},
date = {2021-11-01},
booktitle = {2021 IEEE International Symposium on Multimedia (ISM)},
pages = {281--286},
publisher = {IEEE},
abstract = {Automatic discovery of keywords for lecture video segments is an important component of advanced navigation systems for lecture videos. The suitability of a word or a short phrase to be a keyword depends on various factors, including the frequency in a segment, relative frequency in reference to the full video, font size, time on screen, and the existence in domain and language dictionaries. The research presented in this paper provides a refined understanding of how various factors contribute to predicting keywords based on logistic regression analysis. The analysis employs a real-world dataset consisting of lecture videos from Biology, Computer Science, and Chemistry, hosted on Videopoints, a lecture video management portal. Term frequency, maximum font size, and presence in a domain dictionary were identified as the most important predictors of keywords. The results provide a scientific foundation and …},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
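Razi, Afsaneh; Kim, Seunghyun; Alsoubai, Ashwaq; Stringhini, Gianluca; Solorio, Thamar; De Choudhury, Munmun; Wisniewski, Pamela J
A Human-Centered Systematic Literature Review of the Computational Approaches for Online Sexual Risk Detection Conference
Proceedings of the ACM on Human-Computer Interaction, ACM, 2021.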
Abstract | Links | BibTeX | Tags:
@conference{Razi2021review,
title = {A Human-Centered Systematic Literature Review of the Computational Approaches for Online Sexual Risk Detection},
author = {Afsaneh Razi and Seunghyun Kim and Ashwaq Alsoubai and Gianluca Stringhini and Thamar Solorio and De Choudhury, Munmun and Pamela J Wisniewski},
url = {https://stirlab.org/wp-content/uploads/Review-of-the-Computational-Approaches-for-Online-Sexual-Risk-Detection.pdf},
year = {2021},
date = {2021-10-18},
booktitle = {Proceedings of the ACM on Human-Computer Interaction},
journal = {Proceedings of the ACM on Human-Computer Interaction},
pages = {1--38},
publisher = {ACM},
abstract = {In the era of big data and artificial intelligence, online risk detection has become a popular research topic. From detecting online harassment to the sexual predation of youth, the state-of-the-art in computational risk detection has the potential to protect particularly vulnerable populations from online victimization. Yet, this is a high-risk, high-reward endeavor that requires a systematic and human-centered approach to synthesize disparate bodies of research across different application domains, so that we can identify best practices, potential gaps, and set a strategic research agenda for leveraging these approaches in a way that betters society. Therefore, we conducted a comprehensive literature review to analyze 73 peer-reviewed articles on computational approaches utilizing text or meta-data/multimedia for online sexual risk detection. We identified sexual grooming (75\%), sex trafficking (12\%), and sexual …},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Chebolu, Siva Uday Sampreeth; Dernoncourt, Franck; Lipka, Nedim; Solorio, Thamar
Exploring Conditional Text Generation for Aspect-Based Sentiment Analysis Journal Article
In: arXiv e-prints, pp. arXiv: 2110.02334, 2021.
Abstract | Links | BibTeX | Tags: Sentiment analysis
@article{arXive-prints,
title = {Exploring Conditional Text Generation for Aspect-Based Sentiment Analysis},
author = {Siva Uday Sampreeth Chebolu and Franck Dernoncourt and Nedim Lipka and Thamar Solorio},
url = {https://ui.adsabs.harvard.edu/abs/2021arXiv211002334U/abstract},
year = {2021},
date = {2021-10-01},
urldate = {2021-10-01},
journal = {arXiv e-prints},
pages = {arXiv: 2110.02334},
eprint = {2110.02334},
eprinttype = {arXiv},
abstract = {Aspect-based sentiment analysis (ABSA) is an NLP task that entails processing user-generated reviews to determine (i) the target being evaluated,(ii) the aspect category to which it belongs, and (iii) the sentiment expressed towards the target and aspect pair. In this article, we propose transforming ABSA into an abstract summary-like conditional text generation task that uses targets, aspects, and polarities to generate auxiliary statements. To demonstrate the efficacy of our task formulation and a proposed system, we fine-tune a pre-trained model for conditional text generation tasks to get new state-of-the-art results on a few restaurant domains and urban neighborhoods domain benchmark datasets.},
keywords = {Sentiment analysis},
pubstate = {published},
tppubtype = {article}
}
Dhrangadhariya, Anjani; Aguilar, Gustavo; Solorio, Thamar; Hilfiker, Roger; Müller, Henning
End-to-end fine-grained neural entity recognition of patients, interventions, outcomes Conference
International Conference of the Cross-Language Evaluation Forum for European Languages, Springer, Cham, 2021.
Abstract | Links | BibTeX | Tags: NER
@conference{Dhrangadhariya2021pico,
title = {End-to-end fine-grained neural entity recognition of patients, interventions, outcomes},
author = {Anjani Dhrangadhariya and Gustavo Aguilar and Thamar Solorio and Roger Hilfiker and Henning Müller},
url = {https://arodes.hes-so.ch/record/8949/files/Author%20postprint.pdf},
year = {2021},
date = {2021-09-21},
urldate = {2021-09-21},
booktitle = {International Conference of the Cross-Language Evaluation Forum for European Languages},
pages = {65--77},
publisher = {Springer, Cham},
abstract = {PICO recognition is an information extraction task for detecting parts of text describing Participant (P), Intervention (I), Comparator (C), and Outcome (O) (PICO elements) in clinical trial literature. Each PICO description is further decomposed into finer semantic units. For example, in the sentence ‘The study involved 242 adult men with back pain.’, the phrase ‘242 adult men with back pain’ describes the participant, but this coarse-grained description is further divided into finer semantic units. The term ‘242’ shows “sample size” of the participants, ‘adult’ shows “age”, ‘men’ shows “sex”, and ‘back pain’ show the participant “condition”. Recognizing these fine-grained PICO entities in health literature is a challenging named-entity recognition (NER) task but it can help to fully automate systematic reviews (SR). Previous approaches concentrated on coarse-grained PICO recognition but focus on the fine-grained },
keywords = {NER},
pubstate = {published},
tppubtype = {conference}
}
Zhang, Yigeng; Shafaei, Mahsa; Gonzalez, Fabio; Solorio, Thamar
From None to Severe: Predicting Severity in Movie Scripts Journal Article
In: 2021.
Abstract | Links | BibTeX | Tags: Multitask
@article{arXivpreprintarXiv:2109.09276,
title = {From None to Severe: Predicting Severity in Movie Scripts},
author = {Yigeng Zhang and Mahsa Shafaei and Fabio Gonzalez and Thamar Solorio},
url = {https://arxiv.org/pdf/2109.09276.pdf},
year = {2021},
date = {2021-09-20},
eprint = {2109.09276},
eprinttype = {arXiv},
abstract = {In this paper, we introduce the task of predicting severity of age-restricted aspects of movie content based solely on the dialogue script. We first investigate categorizing the ordinal severity of movies on 5 aspects: Sex, Violence, Profanity, Substance consumption, and Frightening scenes. The problem is handled using a siamese network-based multitask framework which concurrently improves the interpretability of the predictions. The experimental results show that our method outperforms the previous state-of-the-art model and provides useful information to interpret model predictions. The proposed dataset and source code are publicly available at our GitHub repository.},
keywords = {Multitask},
pubstate = {published},
tppubtype = {article}
}
Chen, Shuguang; Aguilar, Gustavo; Neves, Leonardo; Solorio, Thamar
Data augmentation for cross-domain named entity recognition Journal Article
In: 2021.
Abstract | Links | BibTeX | Tags: NER
@article{arXivpreprintarXiv:2109.01758,
title = {Data augmentation for cross-domain named entity recognition},
author = {Shuguang Chen and Gustavo Aguilar and Leonardo Neves and Thamar Solorio},
url = {https://arxiv.org/pdf/2109.01758.pdf},
year = {2021},
date = {2021-09-04},
urldate = {2021-09-04},
eprint = {2109.01758},
eprinttype = {arXiv},
abstract = {Current work in named entity recognition (NER) shows that data augmentation techniques can produce more robust models. However, most existing techniques focus on augmenting in-domain data in low-resource scenarios where annotated data is quite limited. In contrast, we study cross-domain data augmentation for the NER task. We investigate the possibility of leveraging data from high-resource domains by projecting it into the low-resource domains. Specifically, we propose a novel neural architecture to transform the data representation from a high-resource to a low-resource domain by learning the patterns (e.g. style, noise, abbreviations, etc.) in the text that differentiate them and a shared feature space where both domains are aligned. We experiment with diverse datasets and show that transforming the data to the low-resource domain representation achieves significant improvements over only using data from high-resource domains.},
keywords = {NER},
pubstate = {published},
tppubtype = {article}
}
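Shafaei, Mahsa; Smailis, Christos; Kakadiaris, Ioannis; Solorio, Thamar
A Case Study of Deep Learning-Based Multi-Modal Methods for Labeling the Presence of Questionable Content in Movie Trailers Conference
Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021), 2021.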
Abstract | Links | BibTeX | Tags: Deeplearning
@conference{(RANLP2021),
title = {A Case Study of Deep Learning-Based Multi-Modal Methods for Labeling the Presence of Questionable Content in Movie Trailers},
author = {Mahsa Shafaei and Christos Smailis and Ioannis Kakadiaris and Thamar Solorio},
url = {https://aclanthology.org/2021.ranlp-1.146.pdf},
year = {2021},
date = {2021-09-01},
urldate = {2021-09-01},
booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)},
pages = {1297--1307},
abstract = {In this work, we explore different approaches to combine modalities for the problem of automated age-suitability rating of movie trailers. First, we introduce a new dataset containing videos of movie trailers in English downloaded from IMDB and YouTube, along with their corresponding age-suitability rating labels. Secondly, we propose a multi-modal deep learning pipeline addressing the movie trailer age suitability rating problem. This is the first attempt to combine video, audio, and speech information for this problem, and our experimental results show that multi-modal approaches significantly outperform the best mono and bimodal models in this task.},
keywords = {Deeplearning},
pubstate = {published},
tppubtype = {conference}
}
Shirani, Amirreza; Tran, Giai; Trinh, Hieu; Dernoncourt, Franck; Lipka, Nedim; Echevarria, Jose; Solorio, Thamar; Asente, Paul
PSED: A Dataset for Selecting Emphasis in Presentation Slides Conference
Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021, ACL, 2021.
Abstract | Links | BibTeX | Tags:
@conference{Shirani2021psed,
title = {PSED: A Dataset for Selecting Emphasis in Presentation Slides},
author = {Amirreza Shirani and Giai Tran and Hieu Trinh and Franck Dernoncourt and Nedim Lipka and Jose Echevarria and Thamar Solorio and Paul Asente},
url = {https://aclanthology.org/2021.findings-acl.377.pdf},
year = {2021},
date = {2021-08-01},
urldate = {2021-08-01},
booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
pages = {4314--4320},
publisher = {ACL},
abstract = {Emphasizing words in presentation slides allows viewers to direct their gaze to focal points without reading the entire slide, retaining their attention on the speaker. Despite many studies on automatic slide generation, few have addressed helping authors choose which words to emphasize. Motivated by this, we study the problem of choosing candidates for emphasis by introducing a new dataset containing presentation slides with a wide variety of topics. We evaluated a range of state-of-the-art models on this novel dataset by organizing a shared task and inviting multiple researchers to model emphasis in slides.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Parikh, Dwija; Solorio, Thamar
Normalization and Back-Transliteration for Code-Switched Data Inproceedings
In: Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching, pp. 119–124, ACL, 2021.
Links | BibTeX | Tags: Code-Switching
@inproceedings{parikh-solorio-2021-normalization,
  author    = {Dwija Parikh and Thamar Solorio},
  title     = {Normalization and Back-Transliteration for Code-Switched Data},
  booktitle = {Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching},
  publisher = {ACL},
  pages     = {119--124},
  year      = {2021},
  date      = {2021-06-11},
  doi       = {10.18653/v1/2021.calcs-1.15},
  url       = {https://aclanthology.org/2021.calcs-1.15},
  keywords  = {Code-Switching},
  pubstate  = {published},
  tppubtype = {inproceedings},
}
Solorio, Thamar; Chen, Shuguang; Black, Alan W; Diab, Mona; Sitaram, Sunayana; Soto, Victor; Yilmaz, Emre; Srinivasan, Anirudh
Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching Conference
Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching, 2021.
Abstract | Links | BibTeX | Tags: Code-Switching
@conference{-Switching,
title = {Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching},
author = {Thamar Solorio and Shuguang Chen and Alan W Black and Mona Diab and Sunayana Sitaram and Victor Soto and Emre Yilmaz and Anirudh Srinivasan},
url = {https://aclanthology.org/2021.calcs-1.0.pdf},
year = {2021},
date = {2021-06-01},
urldate = {2021-06-01},
booktitle = {Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching},
abstract = {Bienvenidos to the proceedings of the fifth edition of the workshop on computational approaches for linguistic code-switching (CALCS-2021)! Code-switching is this very interesting phenomenon where multilingual speakers communicate by moving back and forth between the languages they speak when communicating with other multilingual speakers. Code-switching (CSW) is predominantly used in speech but since it also tends to be more prevalent in casual settings, we can observe CSW in genres like social media platforms where interactions tend to be more casual.
However interesting, our current NLP technology is lagging behind in the development of resources and methodologies that can effectively process code-switched language. This is true for even the large multilingual pretrained models such as mBERT and BART. At the same time, the growing adoption of smart devices and automated assistants that rely on speech interfaces, makes it even more pressing that our field addresses CSW language data},
keywords = {Code-Switching},
pubstate = {published},
tppubtype = {conference}
}
However interesting, our current NLP technology is lagging behind in the development of resources and methodologies that can effectively process code-switched language. This is true for even the large multilingual pretrained models such as mBERT and BART. At the same time, the growing adoption of smart devices and automated assistants that rely on speech interfaces, makes it even more pressing that our field addresses CSW language data
Chen, Shuguang; Neves, Leonardo; Solorio, Thamar
Mitigating Temporal-Drift: A Simple Approach to Keep NER Models Crisp Conference
Ninth International Workshop on Natural Language Processing for Social Media (SocialNLP @ NAACL 2021), 2021.
Abstract | Links | BibTeX | Tags: NER, Social Media
@conference{Chen2021,
title = {Mitigating Temporal-Drift: A Simple Approach to Keep NER Models Crisp},
author = {Shuguang Chen and Leonardo Neves and Thamar Solorio},
url = {https://arxiv.org/abs/2104.09742
https://github.com/RiTUAL-UH/trending_NER},
year = {2021},
date = {2021-04-19},
booktitle = {Ninth International Workshop on Natural Language Processing for Social Media (SocialNLP @ NAACL 2021)},
abstract = {Performance of neural models for named entity recognition degrades over time, becoming stale. This degradation is due to temporal drift, the change in our target variables' statistical properties over time. This issue is especially problematic for social media data, where topics change rapidly. In order to mitigate the problem, data annotation and retraining of models is common. Despite its usefulness, this process is expensive and time-consuming, which motivates new research on efficient model updating. In this paper, we propose an intuitive approach to measure the potential trendiness of tweets and use this metric to select the most informative instances to use for training. We conduct experiments on three state-of-the-art models on the Temporal Twitter Dataset. Our approach shows larger increases in prediction accuracy with less training data than the alternatives, making it an attractive, practical solution.},
keywords = {NER, Social Media},
pubstate = {published},
tppubtype = {conference}
}
Solorio, Thamar; Shafaei, Mahsa; Smailis, Christos; Bushman, Brad J.; Gentile, Douglas A.; Scharrer, Erica; Stockdale, Laura; Kakadiaris, Ioannis
White Paper – Objectionable Online Content: What is harmful, to whom, and why Online
2021.
Abstract | Links | BibTeX | Tags: Objectionable Online Content
@online{Solorio2021,
  author    = {Thamar Solorio and Mahsa Shafaei and Christos Smailis and Brad J. Bushman and Douglas A. Gentile and Erica Scharrer and Laura Stockdale and Ioannis Kakadiaris},
  title     = {White Paper - Objectionable Online Content: What is harmful, to whom, and why},
  year      = {2021},
  date      = {2021-01-28},
  url       = {http://ritual.uh.edu/wp-content/uploads/2021/01/W1-white-paper.pdf},
  abstract  = {This White Paper summarizes the authors' discussion regarding objectionable content for
the University of Houston (UH) Research Team to outline a strategy for building an
extensive repository of online videos to support research into automated multimodal
approaches to detect objectionable content. The workshop focused on defining what harmful
content is, to whom it is harmful, and why it is harmful.},
  keywords  = {Objectionable Online Content},
  pubstate  = {published},
  tppubtype = {online},
}
the University of Houston (UH) Research Team to outline a strategy for building an
extensive repository of online videos to support research into automated multimodal
approaches to detect objectionable content. The workshop focused on defining what harmful
content is, to whom it is harmful, and why it is harmful.
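Solorio, Thamar; Shafaei, Mahsa; Smailis, Christos; Augenstein, Isabelle; Mitchell, Margaret; Stapf, Ingrid; Kakadiaris, Ioannis
White Paper – Creating a Repository of Objectionable Online Content: Addressing Undesirable Biases and Ethical Considerations Online
2021.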
Abstract | Links | BibTeX | Tags: Objectionable Online Content
@online{Solorio2021b,
  author    = {Thamar Solorio and Mahsa Shafaei and Christos Smailis and Isabelle Augenstein and
Margaret Mitchell and Ingrid Stapf and Ioannis Kakadiaris},
  title     = {White Paper - Creating a Repository of Objectionable Online Content: Addressing Undesirable Biases and Ethical Considerations},
  year      = {2021},
  date      = {2021-01-28},
  url       = {http://ritual.uh.edu/wp-content/uploads/2021/01/W2-Ethics-White-Paper.pdf},
  abstract  = {This white paper summarizes the authors' structured brainstorming regarding ethical
considerations for creating an extensive repository of online content labeled with tags that
describe potentially questionable content for young viewers. The workshop focused on four
topics: 1) identifying risks for unintended biases in the data and labels, 2) how to reduce risks
for unintended biases; 3) identifying ethical considerations of the annotation task, and 4)
reducing the risks for the annotators.},
  keywords  = {Objectionable Online Content},
  pubstate  = {published},
  tppubtype = {online},
}
considerations for creating an extensive repository of online content labeled with tags that
describe potentially questionable content for young viewers. The workshop focused on four
topics: 1) identifying risks for unintended biases in the data and labels, 2) how to reduce risks
for unintended biases; 3) identifying ethical considerations of the annotation task, and 4)
reducing the risks for the annotators.
Kakadiaris, Ioannis; Smailis, Christos; Shafaei, Mahsa; Escalante, Hugo Jair; Ricci, Elisa; Salah, Albert Ali; Struc, Vitomir; Solorio, Thamar
White Paper – Creating a Community of Scholars: Automatic labeling of questionable online content Online
2021.
Abstract | Links | BibTeX | Tags: Objectionable Online Content
@online{Kakadiaris2021,
  author    = {Ioannis Kakadiaris and Christos Smailis and Mahsa Shafaei and Hugo Jair Escalante and Elisa Ricci and Albert Ali Salah and Vitomir Struc and Thamar Solorio},
  title     = {White Paper - Creating a Community of Scholars: Automatic labeling of questionable online content},
  year      = {2021},
  date      = {2021-01-28},
  url       = {http://ritual.uh.edu/wp-content/uploads/2021/01/W3-White-Paper.pdf},
  abstract  = {This white paper summarizes the authors' structured brainstorming regarding creating an
extensive repository of online content labeled with tags that describe potentially questionable
content for young viewers. The workshop focused on three topics:
1. Creating a community of scholars that will contribute to the problem
2. Establishing broad definitions of what constitutes questionable content and of their
sources
3. Setting a sound and ethical approach to data collection and annotation.},
  keywords  = {Objectionable Online Content},
  pubstate  = {published},
  tppubtype = {online},
}
extensive repository of online content labeled with tags that describe potentially questionable
content for young viewers. The workshop focused on three topics:
1. Creating a community of scholars that will contribute to the problem
2. Establishing broad definitions of what constitutes questionable content and of their
sources
3. Setting a sound and ethical approach to data collection and annotation.
Shirani, Amirreza; Tran, Giai; Trinh, Hieu; Dernoncourt, Franck; Lipka, Nedim; Echevarria, Jose; Solorio, Thamar; Asente, Paul
PSED: A Dataset for Selecting Emphasis in Presentation Slides Inproceedings
In: Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021, pp. 4314–4320, 2021.
Links | BibTeX | Tags: Emphasis Selection
@inproceedings{shirani-etal-2021-psed,
  author    = { Amirreza Shirani and Giai Tran and Hieu Trinh and Franck Dernoncourt and Nedim Lipka and Jose Echevarria and Thamar Solorio and Paul Asente},
  title     = {PSED: A Dataset for Selecting Emphasis in Presentation Slides},
  booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
  pages     = {4314--4320},
  year      = {2021},
  date      = {2021-01-01},
  doi       = {10.18653/v1/2021.findings-acl.377},
  url       = {https://aclanthology.org/2021.findings-acl.377},
  keywords  = {Emphasis Selection},
  pubstate  = {published},
  tppubtype = {inproceedings},
}
2020
Shirani, Amirreza; Dernoncourt, Franck; Lipka, Nedim; Asente, Paul; Echevarria, Jose; Solorio, Thamar
SemEval-2020 Task 10: Emphasis Selection for Written Text in Visual Media Conference
Proceedings of the Fourteenth Workshop on Semantic Evaluation, International Committee for Computational Linguistics, Barcelona (online), 2020.
Abstract | Links | BibTeX | Tags: Emphasis Selection, SemEval
@conference{shirani-etal-2020-semeval,
  author    = {Amirreza Shirani and Franck Dernoncourt and Nedim Lipka and Paul Asente and Jose Echevarria and Thamar Solorio},
  title     = {SemEval-2020 Task 10: Emphasis Selection for Written Text in Visual Media},
  booktitle = {Proceedings of the Fourteenth Workshop on Semantic Evaluation},
  publisher = {International Committee for Computational Linguistics},
  address   = {Barcelona (online)},
  year      = {2020},
  date      = {2020-12-03},
  url       = {https://www.aclweb.org/anthology/2020.semeval-1.184},
  abstract  = {In this paper, we present the main findings and compare the results of SemEval-2020 Task 10, Emphasis Selection for Written Text in Visual Media. The goal of this shared task is to design automatic methods for emphasis selection, i.e. choosing candidates for emphasis in textual content to enable automated design assistance in authoring. The main focus is on short text instances for social media, with a variety of examples, from social media posts to inspirational quotes. Participants were asked to model emphasis using plain text with no additional context from the user or other design considerations. SemEval-2020 Emphasis Selection shared task attracted 197 participants in the early phase and a total of 31 teams made submissions to this task. The highest-ranked submission achieved 0.823 Matchm score. The analysis of systems submitted to the task indicates that BERT and RoBERTa were the most common choice of pre-trained models used, and part of speech tag (POS) was the most useful feature. Full results can be found on the task's website.},
  keywords  = {Emphasis Selection, SemEval},
  pubstate  = {published},
  tppubtype = {conference},
}
Koka, Raga Shalini; Chowdhury, Farah Naz; Rahman, Mohammad Rajiur; Solorio, Thamar; Subhlok, Jaspal
Automatic identification of keywords in lecture video segments Inproceedings
In: 2020 IEEE International Symposium on Multimedia (ISM), pp. 162-165, IEEE, 2020.
Links | BibTeX | Tags: Information Extraction
@inproceedings{KokaetAl,
  author = {Koka, Raga Shalini and Chowdhury, Farah Naz and Rahman, Mohammad Rajiur and Solorio, Thamar and Subhlok, Jaspal},
  title = {Automatic identification of keywords in lecture video segments},
  booktitle = {2020 IEEE International Symposium on Multimedia (ISM)},
  pages = {162--165},
  publisher = {IEEE},
  year = {2020},
  date = {2020-12-02},
  url = {https://ieeexplore.ieee.org/abstract/document/9327965/},
  keywords = {Information Extraction},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Kar, Sudipta; Aguilar, Gustavo; Lapata, Mirella; Solorio, Thamar
Multi-view Story Characterization from Movie Plot Synopses and Reviews Conference
EMNLP 2020, ACL 2020.
Links | BibTeX | Tags: Narrative Analysis, Text Classification
@conference{Kar2020,
  author = {Sudipta Kar and Gustavo Aguilar and Mirella Lapata and Thamar Solorio},
  title = {Multi-view Story Characterization from Movie Plot Synopses and Reviews},
  booktitle = {EMNLP 2020},
  pages = {5629--5646},
  organization = {ACL},
  year = {2020},
  date = {2020-11-16},
  url = {https://www.aclweb.org/anthology/2020.emnlp-main.454.pdf},
  keywords = {Narrative Analysis, Text Classification},
  pubstate = {published},
  tppubtype = {conference}
}
Shirani, Amirreza; Dernoncourt, Franck; Echevarria, Jose; Asente, Paul; Lipka, Nedim; Solorio, Thamar
Let Me Choose: From Verbal Context to Font Selection Conference
Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL 2020), Association for Computational Linguistics, 2020.
Abstract | Links | BibTeX | Tags: Font, font selection, From Verbal Context to Font Selection
@conference{shirani-etal-2020-choose,
  author = {Amirreza Shirani and Franck Dernoncourt and Jose Echevarria and Paul Asente and Nedim Lipka and Thamar Solorio},
  title = {Let Me Choose: From Verbal Context to Font Selection},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL 2020)},
  publisher = {Association for Computational Linguistics},
  year = {2020},
  date = {2020-07-19},
  doi = {10.18653/v1/2020.acl-main.762},
  url = {https://www.aclweb.org/anthology/2020.acl-main.762.pdf},
  abstract = {In this paper, we aim to learn associations between visual attributes of fonts and the verbal context of the texts they are typically applied to. Compared to related work leveraging the surrounding visual context, we choose to focus only on the input text as this can enable new applications for which the text is the only visual element in the document. We introduce a new dataset, containing examples of different topics in social media posts and ads, labeled through crowd-sourcing. Due to the subjective nature of the task, multiple fonts might be perceived as acceptable for an input text, which makes this problem challenging. To this end, we investigate different end-to-end models to learn label distributions on crowd-sourced data and capture inter-subjectivity across all annotations.},
  keywords = {Font, font selection, From Verbal Context to Font Selection},
  pubstate = {published},
  tppubtype = {conference}
}
Aguilar, Gustavo; Solorio, Thamar
From English to Code-Switching: Transfer Learning with Strong Morphological Clues Conference
The 58th Annual Meeting of the Association for Computational Linguistics, ACL, 2020.
Abstract | Links | BibTeX | Tags: Code-Switching, Transfer learning
@conference{aguilar20_cs-elmo,
  author = {Gustavo Aguilar and Thamar Solorio},
  title = {From English to Code-Switching: Transfer Learning with Strong Morphological Clues},
  booktitle = {The 58th Annual Meeting of the Association for Computational Linguistics},
  publisher = {ACL},
  year = {2020},
  date = {2020-06-19},
  url = {https://www.aclweb.org/anthology/2020.acl-main.716.pdf},
  abstract = {Linguistic Code-switching (CS) is still an understudied phenomenon in natural language processing. The NLP community has mostly focused on monolingual and multi-lingual scenarios, but little attention has been given to CS in particular. This is partly because of the lack of resources and annotated data, despite its increasing occurrence in social media platforms. In this paper, we aim at adapting monolingual models to code-switched text in various tasks. Specifically, we transfer English knowledge from a pre-trained ELMo model to different code-switched language pairs (i.e., Nepali-English, Spanish-English, and Hindi-English) using the task of language identification. Our method, CS-ELMo, is an extension of ELMo with a simple yet effective position-aware attention mechanism inside its character convolutions. We show the effectiveness of this transfer learning step by outperforming multilingual BERT and homologous CS-unaware ELMo models and establishing a new state of the art in CS tasks, such as NER and POS tagging. Our technique can be expanded to more English-paired code-switched languages, providing more resources to the CS community.},
  keywords = {Code-Switching, Transfer learning},
  pubstate = {published},
  tppubtype = {conference}
}
Aguilar, Gustavo; Kar, Sudipta; Solorio, Thamar
LinCE: A Centralized Linguistic Code-Switching Evaluation Benchmark Conference
Proceedings of the Twelfth International Conference on Language Resources and Evaluation, LREC, 2020.
Abstract | Links | BibTeX | Tags: benchmark, Code-Switching
@conference{aguilar20_lince,
  author = {Gustavo Aguilar and Sudipta Kar and Thamar Solorio},
  title = {LinCE: A Centralized Linguistic Code-Switching Evaluation Benchmark},
  booktitle = {Proceedings of the Twelfth International Conference on Language Resources and Evaluation},
  publisher = {LREC},
  year = {2020},
  date = {2020-05-11},
  url = {https://www.aclweb.org/anthology/2020.lrec-1.223.pdf},
  abstract = {Recent trends in NLP research have raised an interest in linguistic code-switching (CS); modern approaches have been proposed to solve a wide range of NLP tasks on multiple language pairs. Unfortunately, these proposed methods are hardly generalizable to different code-switched languages. In addition, it is unclear whether a model architecture is applicable for a different task while still being compatible with the code-switching setting. This is mainly because of the lack of a centralized benchmark and the sparse corpora that researchers employ based on their specific needs and interests. To facilitate research in this direction, we propose a centralized benchmark for Linguistic Code-switching Evaluation (LinCE) that combines ten corpora covering four different code-switched language pairs (i.e., Spanish-English, Nepali-English, Hindi-English, and Modern Standard Arabic-Egyptian Arabic) and four tasks (i.e., language identification, named entity recognition, part-of-speech tagging, and sentiment analysis). As part of the benchmark centralization effort, we provide an online platform at ritual.uh.edu/lince, where researchers can submit their results while comparing with others in real-time. In addition, we provide the scores of different popular models, including LSTM, ELMo, and multilingual BERT so that the NLP community can compare against state-of-the-art systems. LinCE is a continuous effort, and we will expand it with more low-resource languages and tasks.},
  keywords = {benchmark, Code-Switching},
  pubstate = {published},
  tppubtype = {conference}
}
Shafaei, Mahsa; Samghabadi, Niloofar Safi; Kar, Sudipta; Solorio, Thamar
Age Suitability Rating: Predicting the MPAA Rating Based on Movie Dialogues Proceeding
LREC, 2020.
Abstract | Links | BibTeX | Tags: MPAA Rating, Story Analysis, Text Classification
@inproceedings{Shafaei2020,
  author = {Mahsa Shafaei and Niloofar Safi Samghabadi and Sudipta Kar and Thamar Solorio},
  title = {Age Suitability Rating: Predicting the MPAA Rating Based on Movie Dialogues},
  booktitle = {Proceedings of the Twelfth International Conference on Language Resources and Evaluation},
  publisher = {LREC},
  year = {2020},
  date = {2020-05-01},
  url = {http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.166.pdf},
  abstract = {Movies help us learn and inspire societal change. But they can also contain objectionable content that negatively affects viewers' behavior, especially children. In this paper, our goal is to predict the suitability of movie content for children and young adults based on scripts. The criterion that we use to measure suitability is the MPAA rating that is specifically designed for this purpose. We create a corpus for movie MPAA ratings and propose an RNN-based architecture with attention that jointly models the genre and the emotions in the script to predict the MPAA rating. We achieve 81% weighted F1-score for the classification model that outperforms the traditional machine learning method by 7%.},
  keywords = {MPAA Rating, Story Analysis, Text Classification},
  pubstate = {published},
  tppubtype = {inproceedings}
}
In this paper, our goal is to predict the suitability of movie content for children and young adults based on scripts. The criterion that we use to measure suitability is the MPAA rating that is specifically designed for this purpose. We create a corpus for movie MPAA ratings and propose an RNN-based architecture with attention that jointly models the genre and the emotions in the script to predict the MPAA rating. We achieve 81% weighted F1-score for the classification model that outperforms the traditional machine learning method by 7%.
Aguilar, Gustavo; Ling, Yuan; Zhang, Yu; Yao, Benjamin; Fan, Xing; Guo, Chenlei
Knowledge Distillation from Internal Representations Conference
The Thirty-Fourth AAAI Conference on Artificial Intelligence, AAAI, 2020.
Abstract | Links | BibTeX | Tags:
@conference{aguilar20_kd,
  author = {Gustavo Aguilar and Yuan Ling and Yu Zhang and Benjamin Yao and Xing Fan and Chenlei Guo},
  title = {Knowledge Distillation from Internal Representations},
  booktitle = {The Thirty-Fourth AAAI Conference on Artificial Intelligence},
  publisher = {AAAI},
  year = {2020},
  date = {2020-02-07},
  url = {https://arxiv.org/pdf/1910.03723.pdf},
  abstract = {Knowledge distillation is typically conducted by training a small model (the student) to mimic a large and cumbersome model (the teacher). The idea is to compress the knowledge from the teacher by using its output probabilities as soft-labels to optimize the student. However, when the teacher is considerably large, there is no guarantee that the internal knowledge of the teacher will be transferred into the student; even if the student closely matches the soft-labels, its internal representations may be considerably different. This internal mismatch can undermine the generalization capabilities originally intended to be transferred from the teacher to the student. In this paper, we propose to distill the internal representations of a large model such as BERT into a simplified version of it. We formulate two ways to distill such representations and various algorithms to conduct the distillation. We experiment with datasets from the GLUE benchmark and consistently show that adding knowledge distillation from internal representations is a more powerful method than only using soft-label distillation.},
  keywords = {},
  pubstate = {published},
  tppubtype = {conference}
}
López-Monroy, A. Pastor; González, Fabio A.; Solorio, Thamar
Early author profiling on Twitter using profile features with multi-resolution Journal Article
In: Expert Systems with Applications, vol. 140, 2020, ISBN: 0957-4174.
Abstract | Links | BibTeX | Tags: Profiling
@article{PastorEtAl:20,
  author = {López-Monroy, A. Pastor and González, Fabio A. and Solorio, Thamar},
  title = {Early author profiling on Twitter using profile features with multi-resolution},
  journal = {Expert Systems with Applications},
  volume = {140},
  year = {2020},
  date = {2020-02-01},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2019.112909},
  url = {http://www.sciencedirect.com/science/article/pii/S095741741930627X},
  abstract = {The Author Profiling (AP) task aims to predict demographic characteristics about the authors from documents (e.g., age, gender, native language). The research so far has focused only on forensic scenarios by performing post-analysis using all the available text evidence. This paper introduces the task of Early Author Profiling (EAP) in Twitter. The goal is to effectively recognize profiles using as few tweets as possible from the user history. The task is highly relevant to support social media analysis and different problems related to security and marketing, where prevention and anticipation is crucial. This work proposes a novel strategy that combines a state of the art representation for early text classification and specialized word-vectors for author profiling tasks. In this strategy we build prototypical features called Profile based Meta-Words, which allow us to model AP information at different levels of granularity. Our evaluation shows that the proposed methodology is well suited for profiling little text evidence (e.g., a handful of tweets) in early stages, but as more tweets become available other granularities better encode larger amounts of text in late stages. We evaluated the proposed ideas on gender and language variety identification for English and Spanish, and showed that the proposal outperforms state of the art methodologies.},
  keywords = {Profiling},
  pubstate = {published},
  tppubtype = {article}
}
Arevalo, John; Solorio, Thamar; Montes-y-Gomez, Manuel; Gonzalez, Fabio
Gated multimodal networks Journal Article
In: Neural Computing and Applications, 2020, ISSN: 1433-3058.
Abstract | Links | BibTeX | Tags:
@article{Arevalo2020,
  author = {John Arevalo and Thamar Solorio and Manuel Montes-y-Gomez and Fabio Gonzalez},
  title = {Gated multimodal networks},
  journal = {Neural Computing and Applications},
  year = {2020},
  date = {2020-01-15},
  issn = {1433-3058},
  doi = {10.1007/s00521-019-04559-1},
  url = {https://doi.org/10.1007/s00521-019-04559-1},
  abstract = {This paper considers the problem of leveraging multiple sources of information or data modalities (e.g., images and text) in neural networks. We define a novel model called gated multimodal unit (GMU), designed as an internal unit in a neural network architecture whose purpose is to find an intermediate representation based on a combination of data from different modalities. The GMU learns to decide how modalities influence the activation of the unit using multiplicative gates. The GMU can be used as a building block for different kinds of neural networks and can be seen as a form of intermediate fusion. The model was evaluated on two multimodal learning tasks in conjunction with fully connected and convolutional neural networks. We compare the GMU with other early- and late-fusion methods, outperforming classification scores in two benchmark datasets: MM-IMDb and DeepScene.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Safi Samghabadi, Niloofar; López Monroy, Adrián Pastor; Solorio, Thamar
Detecting Early Signs of Cyberbullying in Social Media Inproceedings
In: Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying, pp. 144–149, European Language Resources Association (ELRA), Marseille, France, 2020, ISBN: 979-10-95546-56-6.
Abstract | Links | BibTeX | Tags: Abusive Language detection
@inproceedings{safi-samghabadi-etal-2020-detecting,
  author = {Safi Samghabadi, Niloofar and
López Monroy, Adrián Pastor and
Solorio, Thamar},
  title = {Detecting Early Signs of Cyberbullying in Social Media},
  booktitle = {Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying},
  pages = {144--149},
  publisher = {European Language Resources Association (ELRA)},
  address = {Marseille, France},
  year = {2020},
  date = {2020-01-01},
  isbn = {979-10-95546-56-6},
  url = {https://www.aclweb.org/anthology/2020.trac-1.23},
  abstract = {Nowadays, the amount of users' activities on online social media is growing dramatically. These online environments provide excellent opportunities for communication and knowledge sharing. However, some people misuse them to harass and bully others online, a phenomenon called cyberbullying. Due to its harmful effects on people, especially youth, it is imperative to detect cyberbullying as early as possible before it causes irreparable damages to victims. Most of the relevant available resources are not explicitly designed to detect cyberbullying, but related content, such as hate speech and abusive language. In this paper, we propose a new approach to create a corpus suited for cyberbullying detection. We also investigate the possibility of designing a framework to monitor the streams of users' online messages and detects the signs of cyberbullying as early as possible.},
  keywords = {Abusive Language detection},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Safi Samghabadi, Niloofar; Patwa, Parth; PYKL, Srinivas; Mukherjee, Prerana; Das, Amitava; Solorio, Thamar
Aggression and Misogyny Detection using BERT: A Multi-Task Approach Inproceedings
In: Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying, pp. 126–131, European Language Resources Association (ELRA), Marseille, France, 2020, ISBN: 979-10-95546-56-6.
Abstract | Links | BibTeX | Tags: Abusive Language detection
@inproceedings{safi-samghabadi-etal-2020-aggression,
  author = {Safi Samghabadi, Niloofar and Patwa, Parth and PYKL, Srinivas and Mukherjee, Prerana and Das, Amitava and Solorio, Thamar},
  title = {Aggression and Misogyny Detection using BERT: A Multi-Task Approach},
  booktitle = {Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying},
  pages = {126--131},
  publisher = {European Language Resources Association (ELRA)},
  address = {Marseille, France},
  year = {2020},
  date = {2020-01-01},
  isbn = {979-10-95546-56-6},
  url = {https://www.aclweb.org/anthology/2020.trac-1.20},
  abstract = {In recent times, the focus of the NLP community has increased towards offensive language, aggression, and hate-speech detection. This paper presents our system for TRAC-2 shared task on ``Aggression Identification'' (sub-task A) and ``Misogynistic Aggression Identification'' (sub-task B). The data for this shared task is provided in three different languages - English, Hindi, and Bengali. Each data instance is annotated into one of the three aggression classes - Not Aggressive, Covertly Aggressive, Overtly Aggressive, as well as one of the two misogyny classes - Gendered and Non-Gendered. We propose an end-to-end neural model using attention on top of BERT that incorporates a multi-task learning paradigm to address both the sub-tasks simultaneously. Our team, ``na14'', scored 0.8579 weighted F1-measure on the English sub-task B and secured 3rd rank out of 15 teams for the task. The code and the model weights are publicly available at https://github.com/NiloofarSafi/TRAC-2. Keywords: Aggression, Misogyny, Abusive Language, Hate-Speech Detection, BERT, NLP, Neural Networks, Social Media},
  keywords = {Abusive Language detection},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Patwa, Parth; Aguilar, Gustavo; Kar, Sudipta; Pandey, Suraj; PYKL, Srinivas; Gambäck, Björn; Chakraborty, Tanmoy; Solorio, Thamar; Das, Amitava
SemEval-2020 Task 9: Overview of Sentiment Analysis of Code-Mixed Tweets Inproceedings
In: Proceedings of the Fourteenth Workshop on Semantic Evaluation, pp. 774–790, International Committee for Computational Linguistics, Barcelona (online), 2020.
Abstract | Links | BibTeX | Tags: Code-Switching, Sentiment analysis
@inproceedings{patwa-etal-2020-semeval,
  author = {Parth Patwa and Gustavo Aguilar and Sudipta Kar and Suraj Pandey and Srinivas PYKL and Björn Gambäck and Tanmoy Chakraborty and Thamar Solorio and Amitava Das},
  title = {SemEval-2020 Task 9: Overview of Sentiment Analysis of Code-Mixed Tweets},
  booktitle = {Proceedings of the Fourteenth Workshop on Semantic Evaluation},
  pages = {774--790},
  publisher = {International Committee for Computational Linguistics},
  address = {Barcelona (online)},
  year = {2020},
  date = {2020-01-01},
  url = {https://www.aclweb.org/anthology/2020.semeval-1.100},
  abstract = {In this paper, we present the results of the SemEval-2020 Task 9 on Sentiment Analysis of Code-Mixed Tweets (SentiMix 2020). We also release and describe our Hinglish (Hindi-English) and Spanglish (Spanish-English) corpora annotated with word-level language identification and sentence-level sentiment labels. These corpora are comprised of 20K and 19K examples, respectively. The sentiment labels are - Positive, Negative, and Neutral. SentiMix attracted 89 submissions in total including 61 teams that participated in the Hinglish contest and 28 submitted systems to the Spanglish competition. The best performance achieved was 75.0% F1 score for Hinglish and 80.6% F1 for Spanglish. We observe that BERT-like models and ensemble methods are the most common and successful approaches among the participants.},
  keywords = {Code-Switching, Sentiment analysis},
  pubstate = {published},
  tppubtype = {inproceedings}
}
2019
Maharjan, Suraj; Mave, Deepthi; Shrestha, Prasha; Montes, Manuel; Gonzalez, Fabio A; Solorio, Thamar
Jointly Learning Author and Annotated Character N-gram Embeddings: A Case Study in Literary Text Conference
In Proceedings of the 2019 Conference on Recent Advances in Natural Language Processing (RANLP), ACL, Varna, Bulgaria, 2019.
Abstract | Links | BibTeX | Tags: Authorship Attribution, Book Likability Prediction, Multitask, Neural Language Model, Transfer learning
@conference{Maharjan2019,
  author = {Suraj Maharjan and Deepthi Mave and Prasha Shrestha and Manuel Montes and Fabio A Gonzalez and Thamar Solorio},
  title = {Jointly Learning Author and Annotated Character N-gram Embeddings: A Case Study in Literary Text},
  booktitle = {Proceedings of the 2019 Conference on Recent Advances in Natural Language Processing (RANLP)},
  pages = {684--692},
  publisher = {ACL},
  address = {Varna, Bulgaria},
  year = {2019},
  date = {2019-09-02},
  url = {https://www.aclweb.org/anthology/R19-1080/},
  abstract = {An author's way of presenting a story through his/her writing style has a great impact on whether the story will be liked by readers or not. In this paper, we learn representations for authors of literary texts together with representations for character n-grams annotated with their functional roles. We train a neural character n-gram based language model using an external corpus of literary texts and transfer learned representations for use in downstream tasks. We show that augmenting the knowledge from external works of authors produces results competitive with other style-based methods for book likability prediction, genre classification, and authorship attribution.},
  keywords = {Authorship Attribution, Book Likability Prediction, Multitask, Neural Language Model, Transfer learning},
  pubstate = {published},
  tppubtype = {conference}
}
Kar, Sudipta; Aguilar, Gustavo; Solorio, Thamar
Multi-view Characterization of Stories from Narratives and Reviews using Multi-label Ranking Online
2019, (ArXiv).
Links | BibTeX | Tags: Narrative Analysis
@online{Kar2019,
  author = {Sudipta Kar and Gustavo Aguilar and Thamar Solorio},
  title = {Multi-view Characterization of Stories from Narratives and Reviews using Multi-label Ranking},
  year = {2019},
  date = {2019-08-27},
  eprint = {1908.09083},
  eprinttype = {arXiv},
  url = {https://arxiv.org/abs/1908.09083},
  note = {ArXiv},
  keywords = {Narrative Analysis},
  pubstate = {published},
  tppubtype = {online}
}
Shafaei, Mahsa; Samghabadi, Niloofar Safi; Kar, Sudipta; Solorio, Thamar
arXiv, (Ed.): 2019, visited: 21.08.2019.
Abstract | Links | BibTeX | Tags: Abusive Language detection, Sentiment analysis, Text Classification
@online{Shafaei2019cb,
  author = {Mahsa Shafaei and Niloofar Safi Samghabadi and Sudipta Kar and Thamar Solorio},
  title = {Rating for Parents: Predicting Children Suitability Rating for Movies Based on Language of the Movies},
  year = {2019},
  date = {2019-08-21},
  eprint = {1908.07819},
  eprinttype = {arXiv},
  url = {https://arxiv.org/abs/1908.07819},
  urldate = {2019-08-21},
  abstract = {The film culture has grown tremendously in recent years. The large number of streaming services put films as one of the most convenient forms of entertainment in today's world. Films can help us learn and inspire societal change. But they can also negatively affect viewers. In this paper, our goal is to predict the suitability of the movie content for children and young adults based on scripts. The criterion that we use to measure suitability is the MPAA rating that is specifically designed for this purpose. We propose an RNN based architecture with attention that jointly models the genre and the emotions in the script to predict the MPAA rating. We achieve 78% weighted F1-score for the classification model that outperforms the traditional machine learning method by 6%.},
  keywords = {Abusive Language detection, Sentiment analysis, Text Classification},
  pubstate = {published},
  tppubtype = {online}
}
Aguilar, Gustavo; Rozgić, Viktor; Wang, Weiran; Wang, Chao
Multimodal and Multi-view Models for Emotion Recognition Conference
The 57th Annual Meeting of the Association for Computational Linguistics, ACL, 2019.
Abstract | Links | BibTeX | Tags: acoustics, emotion recognition, language, Multimodal, multiview
@conference{aguilar19_er,
  author = {Gustavo Aguilar and Viktor Rozgić and Weiran Wang and Chao Wang},
  title = {Multimodal and Multi-view Models for Emotion Recognition},
  booktitle = {The 57th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  year = {2019},
  date = {2019-07-28},
  url = {https://arxiv.org/abs/1906.10198},
  abstract = {Studies on emotion recognition (ER) show that combining lexical and acoustic information results in more robust and accurate models. The majority of the studies focus on settings where both modalities are available in training and evaluation. However, in practice, this is not always the case; getting ASR output may represent a bottleneck in a deployment pipeline due to computational complexity or privacy-related constraints. To address this challenge, we study the problem of efficiently combining acoustic and lexical modalities during training while still providing a deployable acoustic model that does not require lexical inputs. We first experiment with multimodal models and two attention mechanisms to assess the extent of the benefits that lexical information can provide. Then, we frame the task as a multi-view learning problem to induce semantic information from a multimodal model into our acoustic-only network using a contrastive loss function. Our multimodal model outperforms the previous state of the art on the USC-IEMOCAP dataset reported on lexical and acoustic information. Additionally, our multi-view-trained acoustic network significantly surpasses models that have been exclusively trained with acoustic features.},
  keywords = {acoustics, emotion recognition, language, Multimodal, multiview},
  pubstate = {published},
  tppubtype = {conference}
}
Shirani, Amirreza; Dernoncourt, Franck; Asente, Paul; Lipka, Nedim; Kim, Seokhwan; Echevarria, Jose; Solorio, Thamar
Learning Emphasis Selection for Written Text in Visual Media from Crowd-Sourced Label Distributions Conference
The 57th Annual Meeting of the Association for Computational Linguistics (ACL 2019), 2019.
Abstract | Links | BibTeX | Tags: Emphasis Selection
@conference{Shirani2019,
  author = {Amirreza Shirani and Franck Dernoncourt and Paul Asente and Nedim Lipka and Seokhwan Kim and Jose Echevarria and Thamar Solorio},
  title = {Learning Emphasis Selection for Written Text in Visual Media from Crowd-Sourced Label Distributions},
  booktitle = {The 57th Annual Meeting of the Association for Computational Linguistics (ACL 2019)},
  year = {2019},
  date = {2019-06-10},
  url = {https://www.aclweb.org/anthology/papers/P/P19/P19-1112/},
  abstract = {In visual communication, text emphasis is used to increase the comprehension of written text and to convey the author’s intent. We study the problem of emphasis selection, i.e. choosing candidates for emphasis in short written text, to enable automated design assistance in authoring. Without knowing the author’s intent and only considering the input text, multiple emphasis selections are valid. We propose a model that employs end-to-end label distribution learning (LDL) on crowd-sourced data and predicts a selection distribution, capturing the inter-subjectivity (common-sense) in the audience as well as the ambiguity of the input. We compare the model with several baselines in which the problem is transformed to single-label learning by mapping label distributions to absolute labels via majority voting.},
  keywords = {Emphasis Selection},
  pubstate = {published},
  tppubtype = {conference}
}
Shafaei, Mahsa; Lopez-Monroy, Adrian Pastor; Solorio, Thamar
Exploiting Textual, Visual and Product Features for Predicting the Likeability of Movies Conference
The 32nd International FLAIRS Conference, 2019.
Abstract | Links | BibTeX | Tags: Sentiment analysis, Text Classification
@conference{Shafaei2019,
  author = {Mahsa Shafaei and Adrian Pastor Lopez-Monroy and Thamar Solorio},
  title = {Exploiting Textual, Visual and Product Features for Predicting the Likeability of Movies},
  booktitle = {The 32nd International FLAIRS Conference},
  year = {2019},
  date = {2019-05-01},
  url = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS19/paper/view/18305},
  abstract = {Watching movies is one of the most popular entertainments among people. Every year, a huge amount of money goes to the movie industry to release movies to the market. In this paper, we propose a multimodal model to predict the likability of movies using textual, visual and product features. With the help of these features, we capture different aspects of movies and feed them as inputs to binary and multi-class classification and regression models to predict IMDB rating of movies at early steps of production. We also propose our own dataset consisting of about 15000 movie subtitles along with their metadata and poster images. We achieve 76% and 63% weighted F1-score for binary and multiclass classification respectively, and 0.7 mean square error for the regression model. Using prediction methods and data analysis, this research helps the movie business to be more productive.},
  keywords = {Sentiment analysis, Text Classification},
  pubstate = {published},
  tppubtype = {conference}
}
Shirani, Amirreza; Xu, Bowen; Lo, David; Solorio, Thamar; Alipour, Amin
Question Relatedness on Stack Overflow: The Task, Dataset, and Corpus-inspired Models Conference
AAAI Reasoning for Complex Question Answering Workshop (AAAI 2019), 2019.
Abstract | Links | BibTeX | Tags: Community question answering, cQA, Question relatedness, stack overflow
@comment{shirani2019question: workshop paper on question relatedness in Stack Overflow. Author names are separated by the keyword and, as BibTeX name lists require; booktitle and abstract are each kept on a single line.}
@conference{shirani2019question,
title = {Question Relatedness on Stack Overflow: The Task, Dataset, and Corpus-inspired Models},
author = {Amirreza Shirani and Bowen Xu and David Lo and Thamar Solorio and Amin Alipour},
url = {https://arxiv.org/pdf/1905.01966.pdf},
year = {2019},
date = {2019-01-03},
booktitle = {AAAI Reasoning for Complex Question Answering Workshop (AAAI 2019)},
abstract = {Domain-specific community question answering is becoming an integral part of professions. Finding related questions and answers in these communities can significantly improve the effectiveness and efficiency of information seeking. Stack Overflow is one of the most popular communities that is being used by millions of programmers. In this paper, we analyze the problem of predicting knowledge unit (question thread) relatedness in Stack Overflow. In particular, we formulate the question relatedness task as a multi-class classification problem with four degrees of relatedness. We present a large-scale dataset with more than 300K pairs. To the best of our knowledge, this dataset is the largest domain-specific dataset for Question-Question relatedness. We present the steps that we took to collect, clean, process, and assure the quality of the dataset. The proposed dataset on Stack Overflow is a useful resource to develop novel solutions, specifically data-hungry neural network models, for the prediction of relatedness in technical community question-answering forums. We adapt a neural network architecture and a traditional model for this task that effectively utilize information from different parts of knowledge units to compute the relatedness between them. These models can be used to benchmark novel models, as they perform well in our task and in a closely similar task.},
keywords = {Community question answering, cQA, Question relatedness, stack overflow},
pubstate = {published},
tppubtype = {conference}
}
and answers in these communities can significantly improve
the effectiveness and efficiency of information seeking. Stack
Overflow is one of the most popular communities that is being
used by millions of programmers. In this paper, we analyze
the problem of predicting knowledge unit (question thread)
relatedness in Stack Overflow. In particular, we formulate the
question relatedness task as a multi-class classification problem with four degrees of relatedness.
We present a large-scale dataset with more than 300K pairs.
To the best of our knowledge, this dataset is the largest
domain-specific dataset for Question-Question relatedness.
We present the steps that we took to collect, clean, process,
and assure the quality of the dataset. The proposed dataset
on Stack Overflow is a useful resource to develop novel solutions, specifically data-hungry neural network models, for the
prediction of relatedness in technical community question-answering forums.
We adapt a neural network architecture and a traditional
model for this task that effectively utilize information from
different parts of knowledge units to compute the relatedness
between them. These models can be used to benchmark novel
models, as they perform well in our task and in a closely similar task.
2018
Maharjan, Suraj; Montes, Manuel; Gonzalez, Fabio A.; Solorio, Thamar
A Genre-Aware Attention Model to Improve the Likability Prediction of Books Proceeding
In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2018.
Abstract | Links | BibTeX | Tags: Genre-Aware Attention Model, Multitask
@comment{Maharjan2018b: a single paper inside a proceedings volume, so it is typed inproceedings (the proceedings type describes a whole volume and ignores the author field). The proceedings title is stored in booktitle without a hard-coded leading In, which bibliography styles add themselves.}
@inproceedings{Maharjan2018b,
title = {A Genre-Aware Attention Model to Improve the Likability Prediction of Books},
author = {Suraj Maharjan and Manuel Montes and Fabio A. Gonzalez and Thamar Solorio},
url = {http://aclweb.org/anthology/D18-1375},
year = {2018},
date = {2018-11-02},
booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
abstract = {Likability prediction of books has many uses. Readers, writers, as well as the publishing industry, can all benefit from automatic book likability prediction systems. In order to make reliable decisions, these systems need to assimilate information from different aspects of a book in a sensible way. We propose a novel multimodal neural architecture that incorporates genre supervision to assign weights to individual feature types. Our proposed method is capable of dynamically tailoring weights given to feature types based on the characteristics of each book. Our architecture achieves competitive results and even outperforms state-of-the-art for this task.},
keywords = {Genre-Aware Attention Model, Multitask},
pubstate = {published},
tppubtype = {inproceedings}
}
Samghabadi, Niloofar S.; Mave, Deepthi; Kar, Sudipta; Solorio, Thamar
RiTUAL-UH at TRAC 2018 Shared Task: Aggression Identification Inproceedings
In: TRAC1 @ COLING2018, 2018.
Abstract | Links | BibTeX | Tags: Abusive Language detection, Aggression Identification
@comment{safisamghabadi-EtAl:2018:TRAC1: shared-task system description paper. Author names are separated by the keyword and; the venue is stored in booktitle because inproceedings entries do not use the journal field.}
@inproceedings{safisamghabadi-EtAl:2018:TRAC1,
title = {RiTUAL-UH at TRAC 2018 Shared Task: Aggression Identification},
author = {Niloofar S. Samghabadi and Deepthi Mave and Sudipta Kar and Thamar Solorio},
url = {http://www.aclweb.org/anthology/W18-4402},
year = {2018},
date = {2018-08-25},
booktitle = {TRAC1 @ COLING2018},
abstract = {This paper presents our system for “TRAC 2018 Shared Task on Aggression Identification”. Our best systems for the English dataset use a combination of lexical and semantic features. However, for Hindi data using only lexical features gave us the best results. We obtained weighted F1- measures of 0.5921 for the English Facebook task (ranked 12th), 0.5663 for the English Social Media task (ranked 6th), 0.6451 for the Hindi Facebook task (ranked 1st), and 0.4853 for the Hindi Social Media task (ranked 2nd).},
keywords = {Abusive Language detection, Aggression Identification},
pubstate = {published},
tppubtype = {inproceedings}
}
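Kar, Sudipta; Maharjan, Suraj; Solorio, Thamar
Folksonomication: Predicting Tags for Movies from Plot Synopses using Emotion Flow encoded Neural Network Conference
Proceedings of the 27th International Conference on Computational Linguistics, 2018.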
Links | BibTeX | Tags: CNN, Narrative Analysis, Sentiment analysis
@comment{Kar2018b: conference paper on predicting movie tags from plot synopses, published in the Proceedings of the 27th International Conference on Computational Linguistics.}
@conference{Kar2018b,
  author     = {Sudipta Kar and Suraj Maharjan and Thamar Solorio},
  title      = {Folksonomication: Predicting Tags for Movies from Plot Synopses using Emotion Flow encoded Neural Network},
  booktitle  = {Proceedings of the 27th International Conference on Computational Linguistics},
  year       = {2018},
  date       = {2018-08-23},
  url        = {http://ritual.uh.edu/folksonomication-2018},
  keywords   = {CNN, Narrative Analysis, Sentiment analysis},
  pubstate   = {published},
  tppubtype  = {conference},
}
Mave, Deepthi; Maharjan, Suraj; Solorio, Thamar
Language Identification and Analysis of Code-Switched Social Media Text Workshop
Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching, ACL 2018, Association for Computational Linguistics, Melbourne, Australia, 2018.
Abstract | Links | BibTeX | Tags: Code-Switching
@comment{dmave2018: workshop paper on word-level language identification for code-switched text. Author names are separated by the keyword and. The editor is an organization, so it carries an extra pair of braces to keep BibTeX from splitting it into first and last name parts.}
@workshop{dmave2018,
title = {Language Identification and Analysis of Code-Switched Social Media Text},
author = {Deepthi Mave and Suraj Maharjan and Thamar Solorio},
editor = {{Association for Computational Linguistics}},
url = {http://www.aclweb.org/anthology/W18-3206},
year = {2018},
date = {2018-07-19},
booktitle = {Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching, ACL 2018},
publisher = {Association for Computational Linguistics},
address = {Melbourne, Australia},
abstract = {In this paper, we detail our work on comparing different word-level language identification systems for code-switched Hindi-English data and a standard Spanish-English dataset. In this regard, we build a new code-switched dataset for Hindi-English. To understand the code-switching patterns in these language pairs, we investigate different code-switching metrics. We find that the CRF model outperforms the neural network based models by a margin of 2-5 percentage points for Spanish-English and 3-5 percentage points for Hindi-English.},
keywords = {Code-Switching},
pubstate = {published},
tppubtype = {workshop}
}
Aguilar, Gustavo; AlGhamdi, Fahad; Soto, Victor; Diab, Mona; Hirschberg, Julia; Solorio, Thamar
Named Entity Recognition on Code-Switched Data: Overview of the CALCS 2018 Shared Task Inproceedings
In: Association for Computational Linguistics (Ed.): Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching, Association for Computational Linguistics, Melbourne, Australia, 2018.
Abstract | Links | BibTeX | Tags: Code-Switching, English-Spanish, Modern Standard Arabic-Egyptian, NER, shared task, Social Media
@comment{Overview paper of the CALCS 2018 shared task on named entity recognition for code-switched data. The editor is an organization, so it carries an extra pair of braces to keep BibTeX from splitting it into first and last name parts.}
@inproceedings{aguilar@calcs2018,
title = {Named Entity Recognition on Code-Switched Data: Overview of the CALCS 2018 Shared Task},
author = {Gustavo Aguilar and Fahad AlGhamdi and Victor Soto and Mona Diab and Julia Hirschberg and Thamar Solorio},
editor = {{Association for Computational Linguistics}},
url = {http://www.aclweb.org/anthology/W18-3219},
year = {2018},
date = {2018-07-15},
booktitle = {Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching},
publisher = {Association for Computational Linguistics},
address = {Melbourne, Australia},
abstract = {In the third shared task of the Computational Approaches to Linguistic Code-Switching (CALCS) workshop, we focus on Named Entity Recognition (NER) on code-switched social-media data. We divide the shared task into two competitions based on the English-Spanish (ENG-SPA) and Modern Standard Arabic-Egyptian (MSA-EGY) language pairs. We use Twitter data and 9 entity types to establish a new dataset for code-switched NER benchmarks. In addition to the CS phenomenon, the diversity of the entities and the social media challenges make the task considerably hard to process. As a result, the best scores of the competitions are 63.76% and 71.61% for ENG-SPA and MSA-EGY, respectively. We present the scores of 9 participants and discuss the most common challenges among submissions.},
keywords = {Code-Switching, English-Spanish, Modern Standard Arabic-Egyptian, NER, shared task, Social Media},
pubstate = {published},
tppubtype = {inproceedings}
}
López-Monroy, A. Pastor; González, Fabio A.; Montes-y-Gómez, Manuel; Escalante, Hugo Jair; Solorio, Thamar
Early Text Classification using Multi-Resolution Concept Representations Conference
The 16th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computational Linguistics, 2018.
Abstract | Links | BibTeX | Tags: Text Classification
@comment{pastor18: conference paper on early text classification with multi-resolution representations. The editor is an organization, so it carries an extra pair of braces to keep BibTeX from splitting it into first and last name parts; field values carry no trailing whitespace.}
@conference{pastor18,
title = {Early Text Classification using Multi-Resolution Concept Representations},
author = {A. Pastor López-Monroy and Fabio A. González and Manuel Montes-y-Gómez and Hugo Jair Escalante and Thamar Solorio},
editor = {{Association for Computational Linguistics}},
url = {http://www.aclweb.org/anthology/N18-1110},
year = {2018},
date = {2018-06-04},
booktitle = {The 16th Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
publisher = {Association for Computational Linguistics},
abstract = {This paper proposes a novel document representation, called Multi-Resolution Representation (MulR), to improve the early detection of risks in social media sources. The goal is to effectively identify the potential risk using as little evidence as possible and with as much anticipation as possible. MulR allows us to generate multiple ``views'' of the text. These views capture different semantic meanings for words and documents at different levels of granularity, which is very useful in early scenarios to model the variable amounts of evidence. Our experimental evaluation shows that MuLR using low resolution is better suited for modeling short documents (very early stages), whereas large documents (medium/late stages) are better modeled with higher resolutions. We evaluate the proposed ideas in two different tasks where anticipation is critical: sexual predator detection and depression detection. The experimental evaluation for these early tasks revealed that the proposed approach outperforms previous methodologies by a considerable margin.},
keywords = {Text Classification},
pubstate = {published},
tppubtype = {conference}
}
Maharjan, Suraj; Kar, Sudipta; Montes, Manuel; Gonzalez, Fabio A.; Solorio, Thamar
Letting Emotions Flow: Success Prediction by Modeling the Flow of Emotions in Books Inproceedings
In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computational Linguistics, New Orleans, Louisiana, 2018.
Abstract | Links | BibTeX | Tags: Attention Model, Emotion Flow, Emotion Shapes, Likability Classification, Multitask
@comment{Maharjan2018: paper on predicting book success from the flow of emotions, published in the proceedings named in booktitle.}
@inproceedings{Maharjan2018,
  author     = {Suraj Maharjan and Sudipta Kar and Manuel Montes and Fabio A. Gonzalez and Thamar Solorio},
  title      = {Letting Emotions Flow: Success Prediction by Modeling the Flow of Emotions in Books},
  booktitle  = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  publisher  = {Association for Computational Linguistics},
  address    = {New Orleans, Louisiana},
  year       = {2018},
  date       = {2018-06-01},
  url        = {http://www.aclweb.org/anthology/N18-2042},
  keywords   = {Attention Model, Emotion Flow, Emotion Shapes, Likability Classification, Multitask},
  abstract   = {Books have the power to make us feel happiness, sadness, pain, surprise, or sorrow. An author's dexterity in the use of these emotions captivates readers and makes it difficult for them to put the book down. In this paper, we model the flow of emotions over a book using recurrent neural networks and quantify its usefulness in predicting the book's success. We obtained the best weighted F1-score of 0.690 for predicting books' success in a multitask setting (simultaneously predicting success and genre of books)},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}
Aguilar, Gustavo; López-Monroy, A. Pastor; Gonzalez, Fabio A.; Solorio, Thamar
Modeling Noisiness to Recognize Named Entities using Multitask Neural Networks on Social Media Inproceedings
In: Association for Computational Linguistics (Ed.): Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computational Linguistics, New Orleans, Louisiana, 2018.
Abstract | Links | BibTeX | Tags: CRF, Multitask, NER, Phonetics, Phonology, Social Media
@comment{gaguilar2018: paper on multitask neural named entity recognition for social media. The surname López-Monroy is hyphenated so that BibTeX parses it as one last name, matching the spelling used in the other entries of this file. The editor is an organization, so it carries an extra pair of braces to keep BibTeX from splitting it into name parts.}
@inproceedings{gaguilar2018,
title = {Modeling Noisiness to Recognize Named Entities using Multitask Neural Networks on Social Media},
author = {Gustavo Aguilar and A. Pastor López-Monroy and Fabio A. Gonzalez and Thamar Solorio},
editor = {{Association for Computational Linguistics}},
url = {http://www.aclweb.org/anthology/N18-1127},
year = {2018},
date = {2018-06-01},
booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
publisher = {Association for Computational Linguistics},
address = {New Orleans, Louisiana},
abstract = {Recognizing named entities in a document is a key task in many NLP applications. Although current state-of-the-art approaches to this task reach a high performance on clean text (e.g. newswire genres), those algorithms dramatically degrade when they are moved to noisy environments such as social media domains. We present two systems that address the challenges of processing social media data using character-level phonetics and phonology, word embeddings, and Part-of-Speech tags as features. The first model is a multitask end-to-end Bidirectional Long Short-Term Memory (BLSTM)-Conditional Random Field (CRF) network whose output layer contains two CRF classifiers. The second model uses a multitask BLSTM network as feature extractor that transfers the learning to a CRF classifier for the final prediction. Our systems outperform the current F1 scores from state-of-the-art on the Workshop on Noisy User-generated Text 2017 dataset by 2.45% and 3.69%, establishing a more suitable approach for social media environments.},
keywords = {CRF, Multitask, NER, Phonetics, Phonology, Social Media},
pubstate = {published},
tppubtype = {inproceedings}
}
Kar, Sudipta; Maharjan, Suraj; López-Monroy, A. Pastor; Solorio, Thamar
MPST: A Corpus of Movie Plot Synopses with Tags Conference
Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018), European Language Resources Association (ELRA), 2018.
Abstract | Links | BibTeX | Tags: Information Extraction, Narrative Analysis, Sentiment analysis, Text Classification
@comment{Kar2018: conference paper describing the MPST corpus of movie plot synopses with tags. The url field holds two labelled links on separate lines; its value is kept exactly as entered. NOTE(review): this looks like a publication-plugin multi-link convention rather than a plain URL - confirm before normalising.}
@conference{Kar2018,
  author     = {Sudipta Kar and Suraj Maharjan and A. Pastor López-Monroy and Thamar Solorio},
  title      = {MPST: A Corpus of Movie Plot Synopses with Tags},
  booktitle  = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
  publisher  = {European Language Resources Association (ELRA)},
  year       = {2018},
  date       = {2018-05-10},
  url        = {http://sudiptakar.info/wp-content/uploads/2018/05/322_LREC_2018.pdf, Slide
http://sudiptakar.info/wp-content/uploads/2018/02/mpst-corpus-movie-2.pdf, Paper},
  keywords   = {Information Extraction, Narrative Analysis, Sentiment analysis, Text Classification},
  abstract   = {Social tagging of movies reveals a wide range of heterogeneous information about movies, like the genre, plot structure, soundtracks, metadata, visual and emotional experiences. Such information can be valuable in building automatic systems to create tags for movies. Automatic tagging systems can help recommendation engines to improve the retrieval of similar movies as well as help viewers to know what to expect from a movie in advance. In this paper, we set out to the task of collecting a corpus of movie plot synopses and tags. We describe a methodology that enabled us to build a fine-grained set of around 70 tags exposing heterogeneous characteristics of movie plots and the multi-label associations of these tags with some 14K movie plot synopses. We investigate how these tags correlate with movies and the flow of emotions throughout different types of movies. Finally, we use this corpus to explore the feasibility of inferring tags from plot synopses. We expect the corpus will be useful in other tasks where analysis of narratives is relevant.},
  pubstate   = {published},
  tppubtype  = {conference},
}
Shirani, Amirreza; Lopez-Monroy, Pastor; Gonzalez, Fabio; Solorio, Thamar; Alipour, Mohammad Amin
Evaluation of Type Inference with Textual Cues Conference
Thirty-Second AAAI Conference on Artificial Intelligence (AAAI-18), 2018.
Abstract | Links | BibTeX | Tags: Java Language, SVM, Textual Features, Type Prediction
@comment{Shirani2018: conference paper on inferring variable types from textual cues in source code. Author names are separated by the keyword and. The abstract holds only the abstract text; the date the URL was accessed is recorded in urldate.}
@conference{Shirani2018,
title = {Evaluation of Type Inference with Textual Cues},
author = {Amirreza Shirani and Pastor Lopez-Monroy and Fabio Gonzalez and Thamar Solorio and Mohammad Amin Alipour},
url = {https://www.researchgate.net/publication/323627639_Evaluation_of_Type_Inference_with_Textual_Cues},
urldate = {2018-09-07},
year = {2018},
date = {2018-02-07},
booktitle = {Thirty-Second AAAI Conference on Artificial Intelligence (AAAI-18)},
abstract = {Type information plays an important role in the success of information retrieval and recommendation systems in software engineering. Thus, the absence of types in dynamically-typed languages poses a challenge to adapt these systems to support dynamic languages. In this paper, we explore the viability of type inference using textual cues. That is, we formulate the type inference problem as a classification problem which uses the textual features in the source code to predict the type of variables. In this approach, a classifier learns a model to distinguish between types of variables in a program. The model is subsequently used to (approximately) infer the types of other variables. We evaluate the feasibility of this approach on four Java projects wherein type information is already available in the source code and can be used to train and test a classifier. Our experiments show this approach can predict the type of new variables with relatively high accuracy (80% F-measure). These results suggest that textual cues can be complementary tools in inferring types for dynamic languages.},
keywords = {Java Language, SVM, Textual Features, Type Prediction},
pubstate = {published},
tppubtype = {conference}
}
(PDF) Evaluation of Type Inference with Textual Cues. Available from: https://www.researchgate.net/publication/323627639_Evaluation_of_Type_Inference_with_Textual_Cues [accessed Sep 07 2018].
Osborne, John D; Neu, Matthew B; Danila, Maria I; Solorio, Thamar; Bethard, Steven J
CUILESS2016: a clinical corpus applying compositional normalization of text mentions Journal Article
In: Journal of Biomedical Semantics, vol. 9, no. 2, 2018.
Links | BibTeX | Tags: Semantic Analysis
@comment{OsborneEtAl:18: journal article on the CUILESS2016 clinical corpus, Journal of Biomedical Semantics, volume 9, number 2.}
@article{OsborneEtAl:18,
  author     = {John D Osborne and Matthew B Neu and Maria I Danila and Thamar Solorio and Steven J Bethard},
  title      = {CUILESS2016: a clinical corpus applying compositional normalization of text mentions},
  journal    = {Journal of Biomedical Semantics},
  volume     = {9},
  number     = {2},
  year       = {2018},
  date       = {2018-01-10},
  url        = {http://rdcu.be/EpJq},
  keywords   = {Semantic Analysis},
  pubstate   = {published},
  tppubtype  = {article},
}