Introduction

This notebook is a slightly abridged version of a data science assignment from the first year of my degree. It involves using a Naive Bayes classifier to categorise messages as either spam or not spam (ham). This was my favourite assignment from the course unit, so I'd like to share it here.

Bayes' Theorem gives us the probability that a message is spam (S) given that an event E has occurred:

$P\left(S\middle|\ E\right)=\frac{P\left(E\middle|\ S\right)P\left(S\right)}{P\left(E\middle|\ S\right)P\left(S\right)+P\left(E|\lnot S\right)P\left(\lnot S\right)}$

Where:

$P\left(S\middle|\ E\right)$, the probability that the message is spam given the event occurred.

$P\left(S\right)$, the prior probability that a message is spam.

$P\left(\lnot S\right)$, the prior probability that a message is not spam.

Note: $P\left(S\right)$ and $P\left(\lnot S\right)$ are prior values, or prior beliefs. These could be calculated from the number of spam and ham classifications in the data set, or you could use arbitrary values; for example, you could assume that of all email messages sent, 80% are spam and 20% are not. The success of the filter depends on the prior values (a short sketch of the data-driven approach follows these definitions).

$P\left(E\middle|\ S\right)$, the probability that event E occurs in spam messages.

$P\left(E|\lnot S\right)$, the probability that event E occurs in non-spam messages.
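For reference, here is a minimal sketch (not part of the original assignment) of estimating the priors from the labelled data itself, using the df loaded in Section 1 below; the filter in Section 7 uses fixed values of 0.3 and 0.7 instead.

# Sketch only: estimating P(S) and P(¬S) from the labelled data
# rather than assuming arbitrary values.
counts = df['v1'].value_counts()   # 'v1' holds the spam/ham labels
p_spam = counts['spam'] / counts.sum()
p_not_spam = counts['ham'] / counts.sum()
print(p_spam, p_not_spam)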

The Implementation

As this was an introductory unit, the implementation is quite simple: it does not filter out small, frequently used words, nor does it calculate probabilities for words in the frequency list that do not appear in spam.
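For illustration, a hypothetical stop-word filter (not part of the original assignment) could be applied during the cleaning step; the word list below is arbitrary.

# Hypothetical extension: drop a small hand-picked set of common "stop words"
# before counting frequencies. The set below is arbitrary, for illustration only.
STOP_WORDS = {"the", "a", "an", "to", "and", "i", "you", "is", "in", "it"}

def remove_stop_words(text):
    return " ".join(word for word in text.split() if word not in STOP_WORDS)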

1. Read the dataset into a dataframe and explore

In [7]:
import pandas as pd
df = pd.read_csv("spam.csv", encoding="latin-1")
df.head(20)
Out[7]:
v1 v2 Unnamed: 2 Unnamed: 3 Unnamed: 4
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN
1 ham Ok lar... Joking wif u oni... NaN NaN NaN
2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN NaN NaN
3 ham U dun say so early hor... U c already then say... NaN NaN NaN
4 ham Nah I don't think he goes to usf, he lives aro... NaN NaN NaN
5 spam FreeMsg Hey there darling it's been 3 week's n... NaN NaN NaN
6 ham Even my brother is not like to speak with me. ... NaN NaN NaN
7 ham As per your request 'Melle Melle (Oru Minnamin... NaN NaN NaN
8 spam WINNER!! As a valued network customer you have... NaN NaN NaN
9 spam Had your mobile 11 months or more? U R entitle... NaN NaN NaN
10 ham I'm gonna be home soon and i don't want to tal... NaN NaN NaN
11 spam SIX chances to win CASH! From 100 to 20,000 po... NaN NaN NaN
12 spam URGENT! You have won a 1 week FREE membership ... NaN NaN NaN
13 ham I've been searching for the right words to tha... NaN NaN NaN
14 ham I HAVE A DATE ON SUNDAY WITH WILL!! NaN NaN NaN
15 spam XXXMobileMovieClub: To use your credit, click ... NaN NaN NaN
16 ham Oh k...i'm watching here:) NaN NaN NaN
17 ham Eh u remember how 2 spell his name... Yes i di... NaN NaN NaN
18 ham Fine if thatåÕs the way u feel. ThatåÕs the wa... NaN NaN NaN
19 spam England v Macedonia - dont miss the goals/team... NaN NaN NaN

2. Clean the data

I started by removing punctuation and converting all words to lower case.

In [22]:
import string
clean = df.iloc[:, :2]
clean = clean.rename(columns={'v1': 'Category', 'v2': 'Message'})
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

clean['Message'] = clean['Message'].apply(remove_punctuation)
clean['Message'] = clean['Message'].str.lower()
clean.head(5)
Out[22]:
Category Message
0 ham go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat until
1 ham ok lar joking wif u oni
2 spam free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s
3 ham u dun say so early hor u c already then say
4 ham nah i dont think he goes to usf he lives around here though

3. Split the Data

I split the data into two random samples, one for training the model and the other for testing the model.

In [159]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(clean, test_size=0.25, random_state=42)

4. Create a Word Frequency DataFrame

I created a new DataFrame named word_freq that contains each word together with the number of times it appears in spam messages and in ham messages.

In [39]:
spam_data = train_data[train_data['Category'] == 'spam']
ham_data = train_data[train_data['Category'] == 'ham']

word_freq = {}

for message in spam_data['Message']:
    for word in message.split():
        if word not in word_freq:
            word_freq[word] = {'spam': 1, 'ham': 0}
        else:
            word_freq[word]['spam'] += 1

for message in ham_data['Message']:
    for word in message.split():
        if word not in word_freq:
            word_freq[word] = {'spam': 0, 'ham': 1}
        else:
            word_freq[word]['ham'] += 1

word_freq = pd.DataFrame(word_freq).T
word_freq.index.name = 'Word'
word_freq = word_freq.loc[word_freq.sum(axis=1).sort_values(ascending=False).index]
word_freq
Out[39]:
spam ham
Word
to 521 1192
i 32 1658
you 222 1327
a 287 780
the 150 830
... ... ...
surrounded 0 1
cage 0 1
paces 0 1
sink 0 1
hrishi 0 1

8113 rows × 2 columns
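As an aside, the same counts could be built more compactly with collections.Counter; this is only a sketch of an equivalent alternative to the loops above.

# Sketch only: an equivalent way to build the word frequency table.
from collections import Counter

spam_counts = Counter(word for message in spam_data['Message'] for word in message.split())
ham_counts = Counter(word for message in ham_data['Message'] for word in message.split())

word_freq_alt = (pd.DataFrame({'spam': pd.Series(spam_counts), 'ham': pd.Series(ham_counts)})
                 .fillna(0)
                 .astype(int))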

5. Visualise the Data

I wanted to visualise the data, so I used a word cloud to get a feel for the types of language I would be processing.

In [1]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

wordcloud = WordCloud(width=800, height=800, background_color='#383838').generate_from_frequencies(word_freq['spam'])
plt.figure(figsize=(4, 4), facecolor='#383838')
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

6. Calculate $P\left(E\middle| S\right)$ and $P\left(E|\lnot S\right)$

Next I created a new DataFrame named word_prob that gives the probability of each word being found in a spam and ham message.
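The counts are turned into probabilities with a small smoothing constant k (here 0.5), so that a word that never appears in one of the categories still gets a non-zero probability:

$P\left(E\middle|\ S\right)=\frac{\mathrm{count}\left(E,\mathrm{spam}\right)+k}{N_{\mathrm{spam}}+2k}\qquad P\left(E|\lnot S\right)=\frac{\mathrm{count}\left(E,\mathrm{ham}\right)+k}{N_{\mathrm{ham}}+2k}$

where $N_{\mathrm{spam}}$ and $N_{\mathrm{ham}}$ are the numbers of spam and ham messages in the training set.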

In [54]:
k = 0.5

num_spam = train_data['Category'].value_counts()['spam']
num_ham = train_data['Category'].value_counts()['ham']

word_prob = word_freq.copy()
word_prob['P(E|S)'] = (word_freq['spam'] + k) / (num_spam + 2*k)
word_prob['P(E|¬S)'] = (word_freq['ham'] + k) / (num_ham + 2*k)
word_prob.drop(columns=['spam', 'ham'], inplace=True)

word_prob.head()
Out[54]:
P(E|S) P(E|¬S)
Word
to 0.936266 0.329056
i 0.058348 0.457643
you 0.399461 0.366308
a 0.516158 0.215370
the 0.270197 0.229167

7. Checking the 'spamliness' of a single word

With the model trained, I wanted to test the 'spamliness' of a single word.

In [68]:
p_spam = 0.3
p_not_spam = 0.7

p_e_given_s = word_prob.loc['offer', 'P(E|S)']
p_e_given_not_s = word_prob.loc['offer', 'P(E|¬S)']
p_s_given_e = (p_e_given_s * p_spam)/((p_e_given_s * p_spam) + (p_e_given_not_s * p_not_spam))
p_not_s_given_e = (p_e_given_not_s * p_not_spam)/((p_e_given_s * p_spam) + (p_e_given_not_s * p_not_spam))

print("Word = ['offer']")
print("P(E|S) = ", p_e_given_s)
print("P(E|¬S) = ", p_e_given_not_s)
print("P(S|E) = ", p_s_given_e)
print("P(¬S|E) = ", p_not_s_given_e)
Word = ['offer']
P(E|S) =  0.03321364452423698
P(E|¬S) =  0.0015176600441501103
P(S|E) =  0.9036533506457329
P(¬S|E) =  0.09634664935426697

8. Checking the 'spamliness' of several words

Here comes the naivety of naive Bayes: we assume that each word in a message is an independent event, which is of course not realistic. Nevertheless, the model performs well.
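Under this independence assumption, for a message made up of words $x_1,\dots,x_n$, the single-word calculation from Section 7 extends to:

$P\left(S\middle|\ x_1,\dots,x_n\right)=\frac{P\left(S\right)\prod_{i=1}^{n}P\left(x_i\middle|\ S\right)}{P\left(S\right)\prod_{i=1}^{n}P\left(x_i\middle|\ S\right)+P\left(\lnot S\right)\prod_{i=1}^{n}P\left(x_i|\lnot S\right)}$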

In [157]:
from functools import reduce

#get random message from clean dataframe
message_row = clean.sample(n=1)
message = message_row['Message'].values[0]
message_words = message.split(" ")


#create dictionaries in the form word: probability of being (not) spam
prob_spam = {}
prob_ham = {}

#iterate through the probability dataframe, getting the probabilities of each word that appears in the message we are analysing
for word in word_prob.index:
  if word in message_words:
    prob_spam[word] = word_prob.loc[word, 'P(E|S)']
    prob_ham[word] = word_prob.loc[word, 'P(E|¬S)']


#get product of P(xi|spam) & product of P(xi|not spam)
message_given_spam = reduce((lambda x, y: x * y), prob_spam.values())
message_given_not_spam = reduce((lambda x, y: x * y), prob_ham.values())


#calculate the P(S|x1, x2, x3....,xn)
spam_given_message = (p_spam * message_given_spam) / ((p_spam * message_given_spam) + (p_not_spam * message_given_not_spam))
print("Spamliness of message: ", spam_given_message)

#calculate the P(¬S|x1, x2, x3....,xn)
not_spam_given_message = (p_not_spam * message_given_not_spam) / ((p_spam * message_given_spam) + (p_not_spam * message_given_not_spam))
print("Hamliness of message:  ", not_spam_given_message)


#both calculations share the same denominator, so the numerators alone could be compared to decide whether a message is likely ham or spam


#decide if message is spam or not based on which probability is higher

if spam_given_message > not_spam_given_message:
  print("Message is likely spam")
else:
  print("Message is likely ham")
Spamliness of message:  0.11735332306262493
Hamliness of message:   0.882646676937375
Message is likely ham

9. Avoiding floating point underflow

Our aim is to compare the two probabilities $P(S|x_1,\dots,x_n)$ and $P(\neg S|x_1,\dots,x_n)$. According to the model introduced in Section 8, both share a common denominator, which does not affect the comparison, so we only calculate the numerators, which are proportional to $P(S|x_1,\dots,x_n)$ and $P(\neg S|x_1,\dots,x_n)$. To avoid floating point underflow caused by multiplying many very small probabilities, I instead work with their logarithms.
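Since the logarithm of a product is the sum of the logarithms, each numerator can be computed safely as:

$\log\left(P\left(S\right)\prod_{i=1}^{n}P\left(x_i\middle|\ S\right)\right)=\log P\left(S\right)+\sum_{i=1}^{n}\log P\left(x_i\middle|\ S\right)$

And because $\log$ is monotonically increasing, comparing the log values gives the same decision as comparing the original numerators.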

In [181]:
import math as m

def spam_filter(message_row):
  message = str(message_row['Message'])
  message_words = message.split(" ")
  sum_of_log_of_message_words_given_spam = 0
  sum_of_log_of_message_words_given_ham = 0


  #sum the log of each word's probability of appearing in spam/ham (sum of log(P(xi|(¬)S)))
  for word in word_prob.index:
    if word in message_words:
      prob_spam[word] = word_prob.loc[word, 'P(E|S)']
      prob_ham[word] = word_prob.loc[word, 'P(E|¬S)']
      sum_of_log_of_message_words_given_spam += m.log(prob_spam[word])
      sum_of_log_of_message_words_given_ham += m.log(prob_ham[word])



  log_spam = m.log(p_spam) + sum_of_log_of_message_words_given_spam
  log_ham = m.log(p_not_spam) + sum_of_log_of_message_words_given_ham
  # print("log of probablity that message is spam: ", log_spam)
  # print("log of probablity that message is ham:  ", log_ham)

  if log_spam > log_ham:
    # print("Message is likly spam")
    return "spam"
  else:
    # print("Message is likely ham")
    return "ham"
spam_filter(message_row)
Out[181]:
'ham'

10. Testing the Model

I measured the performance of the model on the test set and recorded its accuracy. I was very pleased with the result.

In [185]:
match_spam = 0
match_ham = 0
thought_ham_is_spam = 0
thought_spam_is_ham = 0
for index, row in test_data.iterrows():
  prediction = spam_filter(row)
  if prediction == "spam" and row['Category'] == "spam":
    match_spam += 1
  elif prediction == "ham" and row['Category'] == "ham":
    match_ham += 1
  elif prediction == "spam" and row['Category'] == "ham":
    thought_ham_is_spam += 1
  elif prediction == "ham" and row['Category'] == "spam":
    thought_spam_is_ham += 1

print("match_spam", match_spam)
print("match_ham", match_ham)
print("thought_ham_is_spam ", thought_ham_is_spam)
print("thought_spam_is_ham ", thought_spam_is_ham)
accuracy = (match_ham + match_spam) / (match_ham + match_spam + thought_ham_is_spam + thought_spam_is_ham)
print("Accuracy: ", accuracy)
match_spam 186
match_ham 1104
thought_ham_is_spam  98
thought_spam_is_ham  5
Accuracy:  0.9260588657573582
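The four counters are in effect a confusion matrix for the spam class, so other metrics could be read off the same numbers. A sketch (not part of the original assignment) using the counts above:

# Sketch only: precision and recall for the "spam" class from the counters above.
true_positives = match_spam             # spam correctly identified as spam
false_positives = thought_ham_is_spam   # ham incorrectly flagged as spam
false_negatives = thought_spam_is_ham   # spam that slipped through as ham

precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
print("Precision:", precision)
print("Recall:   ", recall)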

11. Improvements

If I were to repeat this task, I would expand the categorisation to cover common spam/ham phrases, not just single words; a rough sketch of the idea follows below. This would significantly increase the complexity of the task, but maybe one day I will return to it. Overall, I was very pleased with the accuracy of nearly 93%.
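As a purely hypothetical sketch of what phrase-based counting might look like, the frequency step from Section 4 could count adjacent word pairs (bigrams) as well as single words:

# Hypothetical sketch: counting two-word phrases (bigrams) per category,
# reusing the spam_data and ham_data DataFrames from Section 4.
def bigrams(message):
    words = message.split()
    return [" ".join(pair) for pair in zip(words, words[1:])]

bigram_freq = {}
for category, data in [('spam', spam_data), ('ham', ham_data)]:
    for message in data['Message']:
        for phrase in bigrams(message):
            bigram_freq.setdefault(phrase, {'spam': 0, 'ham': 0})
            bigram_freq[phrase][category] += 1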