# Naive_Bayes.py
import re
import numpy as np
# Sample training data
data = [
    ("This is a spam email", "spam"),
    ("Buy one get one free", "spam"),
    ("Hello, how are you?", "ham"),
    ("Congratulations, you've won a prize!", "spam"),
    ("Meeting at 3 PM", "ham"),
    ("Get a discount on your next purchase", "spam"),
]
# Preprocess the training data
word_set = set()
for text, label in data:
    words = re.findall(r"\w+", text.lower())
    word_set.update(words)
word_list = sorted(word_set)
# Create a vocabulary
vocab = {word: index for index, word in enumerate(word_list)}
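# e.g. vocab = {'3': 0, 'a': 1, 'are': 2, 'at': 3, ...}; each word gets a
# stable column index into the per-class count vectors below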
# Initialize counts for spam and ham
spam_count = sum(1 for _, label in data if label == "spam")
ham_count = sum(1 for _, label in data if label == "ham")
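# For this training set: 4 spam messages and 2 ham messages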
# Count the occurrences of words in spam and ham messages
spam_word_count = np.zeros(len(vocab))
ham_word_count = np.zeros(len(vocab))
# Populate the counts
for text, label in data:
    words = re.findall(r"\w+", text.lower())
    label_count = spam_word_count if label == "spam" else ham_word_count
    for word in words:
        if word in vocab:
            word_index = vocab[word]
            label_count[word_index] += 1
# Calculate the prior probabilities
total_messages = len(data)
prior_spam = spam_count / total_messages
prior_ham = ham_count / total_messages
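# Here prior_spam = 4/6 ≈ 0.67 and prior_ham = 2/6 ≈ 0.33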
# Input text to classify
input_text = "You've won a free vacation!"
# Tokenize and process the input text
input_words = re.findall(r"\w+", input_text.lower())
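# -> ['you', 've', 'won', 'a', 'free', 'vacation']; 'vacation' never
#    appears in the training data, so it is ignored below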
# Calculate per-word likelihoods with Laplace (add-one) smoothing.
# Under the multinomial Naive Bayes model the denominator is the total
# number of word tokens observed in each class plus the vocabulary size:
#   P(word | class) = (count(word, class) + 1) / (total_words_in_class + |V|)
spam_total_words = spam_word_count.sum()
ham_total_words = ham_word_count.sum()
likelihood_spam = 1.0
likelihood_ham = 1.0
for word in input_words:
    if word in vocab:  # words unseen in training are skipped
        word_index = vocab[word]
        likelihood_spam *= (spam_word_count[word_index] + 1) / (spam_total_words + len(vocab))
        likelihood_ham *= (ham_word_count[word_index] + 1) / (ham_total_words + len(vocab))
# Apply Bayes' theorem
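# The shared denominator is the total evidence P(input), so the two
# posteriors are normalized to sum to 1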
posterior_spam = (likelihood_spam * prior_spam) / (
    (likelihood_spam * prior_spam) + (likelihood_ham * prior_ham)
)
posterior_ham = 1 - posterior_spam
# Make a classification decision
if posterior_spam > posterior_ham:
    print("Classified as: Spam")
else:
    print("Classified as: Ham")