U.S. Airline Twitter Sentiment Analysis

Mon Feb 18, 2019

The primary objective of our project is to apply sentiment analysis to the US airline industry twitter data to understand the trends in customer perceptions about some of the biggest airline companies in the USA. Further, we will also investigate what factors drive positive and negative sentiments and how this would impact the overall industry.

Data:

We will use airline industry tweets for our analysis, leveraging Twitter datasets sourced from Kaggle.

1
2
3


%load_ext rpy2.ipython
import warnings
warnings.filterwarnings('ignore')

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22


import itertools

import numpy
import pandas as pd

from wordcloud import WordCloud, STOPWORDS

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

import nltk
from nltk.tree import *

import matplotlib.pyplot as plt
import seaborn as sns

import re
import string

from collections import Counter

1
2
3
4
5


%%R
library(dplyr)
library(reticulate)
library(ggplot2)
library(rtweet)

U.S. Airline Twitter

Load the Data

1
2
3
4
5
6
7


nltk.download("stopwords") # Load StopWords
from nltk.corpus import stopwords
set(stopwords.words('english'))

tweets_dataset = pd.read_csv("data/US_Airline_Tweets.csv") # Load Data

# %Rpush tweets_dataset

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chiefkemist/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

1

tweets_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
tweet_id                        14640 non-null int64
airline_sentiment               14640 non-null object
airline_sentiment_confidence    14640 non-null float64
negativereason                  9178 non-null object
negativereason_confidence       10522 non-null float64
airline                         14640 non-null object
airline_sentiment_gold          40 non-null object
name                            14640 non-null object
negativereason_gold             32 non-null object
retweet_count                   14640 non-null int64
text                            14640 non-null object
tweet_coord                     1019 non-null object
tweet_created                   14640 non-null object
tweet_location                  9907 non-null object
user_timezone                   9820 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.7+ MB

1

tweets_dataset.describe()

	tweet_id	airline_sentiment_confidence	negativereason_confidence	retweet_count
count	1.464000e+04	14640.000000	10522.000000	14640.000000
mean	5.692184e+17	0.900169	0.638298	0.082650
std	7.791112e+14	0.162830	0.330440	0.745778
min	5.675883e+17	0.335000	0.000000	0.000000
25%	5.685592e+17	0.692300	0.360600	0.000000
50%	5.694779e+17	1.000000	0.670600	0.000000
75%	5.698905e+17	1.000000	1.000000	0.000000
max	5.703106e+17	1.000000	1.000000	44.000000

1

tweets_dataset.head()

	tweet_id	airline_sentiment	airline_sentiment_confidence	negativereason	negativereason_confidence	airline	airline_sentiment_gold	name	negativereason_gold	text	tweet_coord	tweet_created	tweet_location	user_timezone
0	570306133677760513	neutral	1.0000	NaN	NaN	Virgin America	NaN	cairdin	NaN	@VirginAmerica What @dhepburn said.	NaN	2015-02-24 11:35:52 -0800	NaN	Eastern Time (US & Canada)
1	570301130888122368	positive	0.3486	NaN	0.0000	Virgin America	NaN	jnardino	NaN	@VirginAmerica plus you've added commercials t...	NaN	2015-02-24 11:15:59 -0800	NaN	Pacific Time (US & Canada)
2	570301083672813571	neutral	0.6837	NaN	NaN	Virgin America	NaN	yvonnalynn	NaN	@VirginAmerica I didn't today... Must mean I n...	NaN	2015-02-24 11:15:48 -0800	Lets Play	Central Time (US & Canada)
3	570301031407624196	negative	1.0000	Bad Flight	0.7033	Virgin America	NaN	jnardino	NaN	@VirginAmerica it's really aggressive to blast...	NaN	2015-02-24 11:15:36 -0800	NaN	Pacific Time (US & Canada)
4	570300817074462722	negative	1.0000	Can't Tell	1.0000	Virgin America	NaN	jnardino	NaN	@VirginAmerica and it's a really big bad thing...	NaN	2015-02-24 11:14:45 -0800	NaN	Pacific Time (US & Canada)

1
2
3


%%R

tweets_df <- read.csv("data/US_Airline_Tweets.csv", encoding = "UTF-8", header=TRUE, stringsAsFactors=FALSE)

Data Exploration

Data Columns:

1
2


%%R
tweets_df %>% colnames() # Columns

 [1] "tweet_id"                     "airline_sentiment"           
 [3] "airline_sentiment_confidence" "negativereason"              
 [5] "negativereason_confidence"    "airline"                     
 [7] "airline_sentiment_gold"       "name"                        
 [9] "negativereason_gold"          "retweet_count"               
[11] "text"                         "tweet_coord"                 
[13] "tweet_created"                "tweet_location"              
[15] "user_timezone"

Data Dimention / Shape:

1
2


%%R
tweets_df %>% dim() # Shape / Dimension

[1] 14640    15

Data Summary:

1
2


%%R
tweets_df %>% summary() # Data Summary

    tweet_id         airline_sentiment  airline_sentiment_confidence
 Min.   :5.676e+17   Length:14640       Min.   :0.3350              
 1st Qu.:5.686e+17   Class :character   1st Qu.:0.6923              
 Median :5.695e+17   Mode  :character   Median :1.0000              
 Mean   :5.692e+17                      Mean   :0.9002              
 3rd Qu.:5.699e+17                      3rd Qu.:1.0000              
 Max.   :5.703e+17                      Max.   :1.0000              

 negativereason     negativereason_confidence   airline         
 Length:14640       Min.   :0.000             Length:14640      
 Class :character   1st Qu.:0.361             Class :character  
 Mode  :character   Median :0.671             Mode  :character  
                    Mean   :0.638                               
                    3rd Qu.:1.000                               
                    Max.   :1.000                               
                    NA's   :4118                                
 airline_sentiment_gold     name           negativereason_gold
 Length:14640           Length:14640       Length:14640       
 Class :character       Class :character   Class :character   
 Mode  :character       Mode  :character   Mode  :character   




 retweet_count          text           tweet_coord        tweet_created     
 Min.   : 0.00000   Length:14640       Length:14640       Length:14640      
 1st Qu.: 0.00000   Class :character   Class :character   Class :character  
 Median : 0.00000   Mode  :character   Mode  :character   Mode  :character  
 Mean   : 0.08265                                                           
 3rd Qu.: 0.00000                                                           
 Max.   :44.00000                                                           

 tweet_location     user_timezone     
 Length:14640       Length:14640      
 Class :character   Class :character  
 Mode  :character   Mode  :character

positive, negative, and neutral tweets shapes:

1
2
3
4
5
6
7
8


positive_tweets = tweets_dataset['airline_sentiment'].str.contains("positive")
negative_tweets = tweets_dataset['airline_sentiment'].str.contains("negative")
neutral_tweets = tweets_dataset['airline_sentiment'].str.contains("neutral")


print(f'positive tweets shape: {positive_tweets.shape}')
print(f'negative tweets shape: {negative_tweets.shape}')
print(f'neutral tweets shape: {neutral_tweets.shape}')

positive tweets shape: (14640,)
negative tweets shape: (14640,)
neutral tweets shape: (14640,)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


def word_cloud(content):
    wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(content))
    fig = plt.figure(
        figsize = (40, 30),
        facecolor = 'k',
        edgecolor = 'k')
    plt.imshow(wordcloud, interpolation = 'bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

Positive Tweets about Airlines Word Cloud: #

1

word_cloud(tweets_dataset.loc[tweets_dataset['airline_sentiment'] == 'positive'].text.values)

png

Neutral Tweets about Airlines Word Cloud: #

1

word_cloud(tweets_dataset.loc[tweets_dataset['airline_sentiment'] == 'neutral'].text.values)

png

Negative Tweets about Airlines Word Cloud: #

1

word_cloud(tweets_dataset.loc[tweets_dataset['airline_sentiment'] == 'negative'].text.values)

png

airlines tweets shapes:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13


American = tweets_dataset['airline'].str.contains("American")
Delta = tweets_dataset['airline'].str.contains("Delta")
Southwest = tweets_dataset['airline'].str.contains("Southwest")
United = tweets_dataset['airline'].str.contains("United")
VAmerica = tweets_dataset['airline'].str.contains("Virgin America")
USair = tweets_dataset['airline'].str.contains("US Airways")

print(f'American Tweets Shape: {American.shape}')
print(f'Delta Tweets Shape: {Delta.shape}')
print(f'Southwest Tweets Shape: {Southwest.shape}')
print(f'United Tweets Shape: {United.shape}')
print(f'VAmerica Tweets Shape: {VAmerica.shape}')
print(f'USair Tweets Shape: {USair.shape}')

American Tweets Shape: (14640,)
Delta Tweets Shape: (14640,)
Southwest Tweets Shape: (14640,)
United Tweets Shape: (14640,)
VAmerica Tweets Shape: (14640,)
USair Tweets Shape: (14640,)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


%%R
tweets_sentiment_groups_df <- tweets_df %>%
    group_by(airline_sentiment) %>%
    rowwise() %>%
    mutate(SENTIMENT_COUNT = n()) %>%
    select(airline, airline_sentiment, SENTIMENT_COUNT)

# tweets_sentiment_groups_df %>% head()
    
ggplot(tweets_sentiment_groups_df, aes(x = airline, y = SENTIMENT_COUNT, fill=airline_sentiment)) +
  geom_bar(stat="identity") +
  coord_flip() +
  scale_y_continuous(name="Sentiment Tallies", labels = scales::comma) +
  ggtitle("Airline Sentiment Type Counts")

png

Train Test Split

1
2
3
4
5
6


tweets_dataset_train, tweets_dataset_test = train_test_split(
    tweets_dataset, test_size=0.2
)

print(f'tweets_dataset_train tweets shape: {tweets_dataset_train.shape}')
print(f'tweets_dataset_test tweets shape: {tweets_dataset_test.shape}')

tweets_dataset_train tweets shape: (11712, 15)
tweets_dataset_test tweets shape: (2928, 15)

Pre-Processing Data

1

dataset = tweets_dataset.copy()[['airline_sentiment', 'text']]

Dividing dataset into 3 categories by sentiment:

1
2
3


positive = dataset['airline_sentiment'].str.contains("positive")
negative = dataset['airline_sentiment'].str.contains("negative")
neutral = dataset['airline_sentiment'].str.contains("neutral")

If the categories have different number of tweets the classifier could learn to guess the most prevalent category, skewing the results, so, we balance the number of tweets in each category

We are going for 3 categories - positive, negative and neutral:

1
2
3
4
5
6
7
8


dataset_positive = dataset.copy()[dataset.airline_sentiment == 'positive'][:2363]
dataset_negative = dataset.copy()[dataset.airline_sentiment == 'negative'][:2363]
dataset_neutral = dataset.copy()[dataset.airline_sentiment == 'neutral'][:2363]

dataset = pd.concat(
    [dataset_positive, dataset_negative, dataset_neutral],
    ignore_index=True
).reset_index(drop=True)

Defining a function to process tweets for analysis Tweets contain several forms of punctuation, capitalizations, url links, etc. They need to be cleaned prior to analysis

1
2
3
4
5
6


def tweet_clean(tweet):
    tweet = ''.join(c for c in tweet if c not in string. punctuation) #Removing special characters
    tweet = re.sub(r'\d+', '', tweet)  #Removing digits
    tweet = re.sub('((www\S+)|(http\S+))', '', tweet) #Removing urls
    tweet = tweet.lower().strip()  #Convering to lower case
    return tweet

Setting up stopwords: Stop words are words that don’t convey any sentiment or feeling. Example: at, and, the

1
2
3


stop_words = nltk.corpus.stopwords.words("english")
stop_words = [''.join(c for c in s if c not in string.punctuation) for s in stop_words]
stop_words = [t.encode('utf-8') for t in stop_words]

Defining a function to remove stopwords. This function splits the tweets into individual words and removes stop words

1
2
3
4


def remove_sw(tweet, stop_words):
    text = tweet.split()
    text = ' '.join(word for word in text if word not in stop_words)
    return text

Processing Tweets:

1
2
3
4
5
6
7


processed_tweets = []
for tweet in dataset['text']:
    cleaned = tweet_clean(tweet)
    stopwords_removed = remove_sw(cleaned, stop_words)
    processed_tweets.append(stopwords_removed)

dataset['text'] = processed_tweets

Creating train and test datasets (80%-20%):

1
2
3
4
5
6
7
8


X_train, X_test, Y_train, Y_test = train_test_split(
    dataset['text'], dataset['airline_sentiment'], test_size=0.2
)

print(f'X_train tweets shape: {X_train.shape}')
print(f'X_test tweets shape: {X_test.shape}')
print(f'Y_train tweets shape: {Y_train.shape}')
print(f'Y_test tweets shape: {Y_test.shape}')

X_train tweets shape: (5671,)
X_test tweets shape: (1418,)
Y_train tweets shape: (5671,)
Y_test tweets shape: (1418,)

1
2
3
4
5
6
7
8
9


data_train = pd.DataFrame()
data_train['text'] = X_train
data_train['airline_sentiment'] = Y_train
data_train = data_train.reset_index(drop=True)

data_test = pd.DataFrame()
data_test['text'] = X_test
data_test['airline_sentiment'] = Y_test
data_test = data_test.reset_index(drop=True)

Training Naive Bayes Classifier:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72


class NaiveBayesClassifier(object):
    def __init__(self, data_train):
        self.data_train = data_train
        self.dataset_positive = data_train.copy()[data_train.airline_sentiment == 'positive']
        self.dataset_negative = data_train.copy()[data_train.airline_sentiment == 'negative']
        self.dataset_neutral = data_train.copy()[data_train.airline_sentiment == 'neutral']
    def fit(self):
        Pr_pos = dataset_positive.shape[0]/self.data_train.shape[0]
        Pr_neg = dataset_negative.shape[0]/self.data_train.shape[0]
        Pr_neu = dataset_neutral.shape[0]/self.data_train.shape[0]
        self.Prior  = (Pr_pos, Pr_neg, Pr_neu)
        self.pos_words = ' '.join(self.dataset_positive['text'].tolist()).split()
        self.neg_words = ' '.join(self.dataset_negative['text'].tolist()).split()
        self.neu_words = ' '.join(self.dataset_neutral['text'].tolist()).split()
        words_list = ' '.join(self.data_train['text'].tolist()).split()
        self.vocab = len(Counter(words_list))
        wc_pos = len(' '.join(self.dataset_positive['text'].tolist()).split())
        wc_neg = len(' '.join(self.dataset_negative['text'].tolist()).split())
        wc_neu = len(' '.join(self.dataset_neutral['text'].tolist()).split())
        self.word_count = (wc_pos, wc_neg, wc_neu)
        return self
    def predict(self, data_test):
        category = ['positive', 'negative', 'neutral']
        classification = []
        for tweet in data_test['text']:
            text = tweet.split()
            val_pos = numpy.array([])
            val_neg = numpy.array([])
            val_neu = numpy.array([])
            for word in text:
                tmp_pos = numpy.log((self.pos_words.count(word)+1)/(self.word_count[0]+self.vocab))
                tmp_neg = numpy.log((self.neg_words.count(word)+1)/(self.word_count[1]+self.vocab))
                tmp_neu = numpy.log((self.neu_words.count(word)+1)/(self.word_count[2]+self.vocab))
                val_pos = numpy.append(val_pos, tmp_pos)
                val_neg = numpy.append(val_neg, tmp_neg)
                val_neu = numpy.append(val_neu, tmp_neu)
            val_pos = numpy.log(self.Prior[0]) + numpy.sum(val_pos)
            val_neg = numpy.log(self.Prior[1]) + numpy.sum(val_neg)
            val_neu = numpy.log(self.Prior[2]) + numpy.sum(val_neu)
            probability = (val_pos, val_neg, val_neu)
            classification.append(category[numpy.argmax(probability)])
        return classification
    def score(self, feature, target):
        compare = []
        for i in range(0,len(feature)):
            if feature[i] == target[i]:
                tmp ='correct'
                compare.append(tmp)
            else:
                tmp ='incorrect'
                compare.append(tmp)
        r = Counter(compare)
        accuracy = r['correct']/(r['correct']+r['incorrect'])
        return accuracy

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    cm = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = numpy.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True')
    plt.xlabel('Predicted')
    plt.tight_layout()
    plt.show()

Running the classifier (accuracy = 78%)

1
2
3
4
5


run = NaiveBayesClassifier(data_train)
run = run.fit()
predict = run.predict(data_test)
score = run.score(predict,data_test.airline_sentiment.tolist())
print(f'Naive Bayes Classifier Score: {score}')

Naive Bayes Classifier Score: 0.7954866008462623

Confusion Matrix:

1
2
3


class_labels = ['Positive', 'Negative', 'Neutral']
cm = confusion_matrix(data_test['airline_sentiment'], predict).T
plot_confusion_matrix(cm, classes = class_labels)

[[ 0.76406534  0.16696915  0.06896552]
 [ 0.04232804  0.80952381  0.14814815]
 [ 0.02658487  0.15337423  0.8200409 ]]

png

Now let’s consider only 2 categories - positive and negative

1
2
3
4
5
6
7


dataset_positive2 = dataset.copy()[dataset.airline_sentiment == 'positive'][:2363]
dataset_negative2 = dataset.copy()[dataset.airline_sentiment == 'negative'][:2363]

dataset2 = pd.concat(
    [dataset_positive2, dataset_negative2],
    ignore_index=True
).reset_index(drop=True)

Processing tweets using functions defined earlier

1
2
3
4
5
6
7


processed_tweets2 = []
for tweet in dataset2['text']:
    cleaned = tweet_clean(tweet)
    stopwords_removed = remove_sw(cleaned, stop_words)
    processed_tweets2.append(stopwords_removed)

dataset2['text'] = processed_tweets2

Creating train and test datasets (80%-20%)

1
2
3
4
5
6
7
8


X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    dataset2['text'], dataset2['airline_sentiment'], test_size=0.2
)

print(f'X_train2 tweets shape: {X_train2.shape}')
print(f'X_test2 tweets shape: {X_test2.shape}')
print(f'Y_train2 tweets shape: {Y_train2.shape}')
print(f'Y_test2 tweets shape: {Y_test2.shape}')

X_train2 tweets shape: (3780,)
X_test2 tweets shape: (946,)
Y_train2 tweets shape: (3780,)
Y_test2 tweets shape: (946,)

1
2
3
4
5
6
7
8
9


data_train2 = pd.DataFrame()
data_train2['text'] = X_train2
data_train2['airline_sentiment'] = Y_train2
data_train2 = data_train2.reset_index(drop=True)

data_test2 = pd.DataFrame()
data_test2['text'] = X_test2
data_test2['airline_sentiment'] = Y_test2
data_test2 = data_test2.reset_index(drop=True)

Running the classifier again (accuracy = 92%)

1
2
3
4
5


run2 = NaiveBayesClassifier(data_train2)
run2 = run2.fit()
predict2 = run2.predict(data_test2)
score2 = run2.score(predict2,data_test2.airline_sentiment.tolist())
print(f'Naive Bayes Classifier Score2: {score2}')

Naive Bayes Classifier Score2: 0.9334038054968288

Creating the confusion matrix

1
2
3


class_labels2 = ['Positive', 'Negative']
cm2 = confusion_matrix(data_test2['airline_sentiment'], predict2).T
plot_confusion_matrix(cm2, classes = class_labels2)

[[ 0.9048583   0.          0.0951417 ]
 [ 0.          0.          1.        ]
 [ 0.03111111  0.          0.96888889]]

png

Training Multinomial Naive Bayes

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11


def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Return the cleaned text as a list of words
    '''
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

1
2
3
4


def vectorize_text(text_array):
    transformer = CountVectorizer(analyzer=text_process).fit(text_array)
    vectorized_text_array = transformer.transform(text_array)
    return vectorized_text_array

1
2
3
4
5
6
7


Txt = vectorize_text(tweets_dataset['text'])

print('Shape of Sparse Matrix: ', Txt.shape)
print('Amount of Non-Zero occurrences: ', Txt.nnz)
# Percentage of non-zero values
density = (100.0 * Txt.nnz / (Txt.shape[0] * Txt.shape[1]))
print('Density: {}'.format((density)))

Shape of Sparse Matrix:  (14640, 19717)
Amount of Non-Zero occurrences:  150578
Density: 0.05216504799747022

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15


sent = tweets_dataset['airline_sentiment']

Txt_train, Txt_test, sent_train, sent_test = train_test_split(Txt, sent, test_size=0.2)

airline_sent_nb = MultinomialNB()

airline_sent_nb.fit(Txt_train, sent_train)

airline_sent_preds = airline_sent_nb.predict(Txt_test)

mat = confusion_matrix(sent_test, airline_sent_preds)
# Confusion Matrix
print(mat)
# Classification Report
print(classification_report(sent_test, airline_sent_preds))

[[1703   81   39]
 [ 347  212   53]
 [ 199   32  262]]
             precision    recall  f1-score   support

   negative       0.76      0.93      0.84      1823
    neutral       0.65      0.35      0.45       612
   positive       0.74      0.53      0.62       493

avg / total       0.73      0.74      0.72      2928

1
2


# Confusion Matrix Plot
plot_confusion_matrix(mat.T, classes = ['negative', 'neutral', 'positive'])

[[ 0.75722543  0.1542908   0.08848377]
 [ 0.24923077  0.65230769  0.09846154]
 [ 0.11016949  0.14971751  0.74011299]]

png

1

sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, xticklabels = True, yticklabels = True)

<matplotlib.axes._subplots.AxesSubplot at 0x134aa1da0>

png