3. To construct a part-of-speech tagger using a Hidden Markov Model (HMM) and implement the Viterbi algorithm to decode the most probable tag sequence for a sentence.
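An HMM tagger is estimated from three tables counted over the training corpus: start probabilities P(tag at position 1), transition probabilities P(tag_t | tag_t-1), and emission probabilities P(word | tag). Viterbi decoding then fills a trellis of best log-scores, V[t][j] = max over i of (V[t-1][i] + log P(j | i)) + log P(word_t | j), and returns the tag sequence achieving the best final score. The program below works in log space to avoid underflow and substitutes a small floor (1e-6) for unseen events in place of proper smoothing.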
# Simple POS Tagger using HMM + Viterbi
from collections import defaultdict
import math

# Training data: sentences as lists of (word, tag) pairs
training_data = [
    [('the', 'DET'), ('cat', 'NOUN'), ('sat', 'VERB')],
    [('the', 'DET'), ('dog', 'NOUN'), ('barked', 'VERB')],
    [('a', 'DET'), ('cat', 'NOUN'), ('meowed', 'VERB')],
]
# Step 1: Count frequencies
tags = set()
start_counts = defaultdict(int)
transition_counts = defaultdict(lambda: defaultdict(int))
emission_counts = defaultdict(lambda: defaultdict(int))
for sentence in training_data:
    prev_tag = None
    for i, (word, tag) in enumerate(sentence):
        tags.add(tag)                    # record the tag set
        emission_counts[tag][word] += 1  # count word given tag
        if i == 0:
            start_counts[tag] += 1       # first tag of the sentence
        if prev_tag:
            transition_counts[prev_tag][tag] += 1  # tag bigram
        prev_tag = tag
# Convert counts to probabilities
def normalize(counts):
    total = sum(counts.values())
    return {k: v / total for k, v in counts.items()}
start_probs = normalize(start_counts)
transition_probs = {t: normalize(transition_counts[t]) for t in tags}
emission_probs = {t: normalize(emission_counts[t]) for t in tags}
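# Sanity check: with this toy corpus the learned tables are fully
# determined, e.g.
#   start_probs             == {'DET': 1.0}
#   transition_probs['DET'] == {'NOUN': 1.0}
#   emission_probs['NOUN']  == {'cat': 2/3, 'dog': 1/3}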
# Step 2: Viterbi algorithm
def viterbi(sentence):
    V = [{}]   # V[t][tag] = best log-probability of a path ending in tag at position t
    path = {}  # best tag sequence ending in each tag
    # Initialization: start probability plus emission of the first word
    for tag in tags:
        V[0][tag] = math.log(start_probs.get(tag, 1e-6)) + \
                    math.log(emission_probs[tag].get(sentence[0], 1e-6))
        path[tag] = [tag]
    # Recursion: extend the best previous path to each current tag
    for t in range(1, len(sentence)):
        V.append({})
        new_path = {}
        for curr_tag in tags:
            (prob, best_prev) = max(
                (V[t-1][prev_tag] + math.log(transition_probs[prev_tag].get(curr_tag, 1e-6)) +
                 math.log(emission_probs[curr_tag].get(sentence[t], 1e-6)), prev_tag)
                for prev_tag in tags
            )
            V[t][curr_tag] = prob
            new_path[curr_tag] = path[best_prev] + [curr_tag]
        path = new_path
    # Termination: pick the highest-scoring final tag and return its path
    (prob, final_tag) = max((V[-1][tag], tag) for tag in tags)
    return path[final_tag]
# Test
test_sentence = ['the', 'cat', 'barked']
print("Sentence:", test_sentence)
print("Predicted Tags:", viterbi(test_sentence))