-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLogistic_Regression.py
More file actions
127 lines (103 loc) · 3.83 KB
/
Logistic_Regression.py
File metadata and controls
127 lines (103 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
"""Assignment2.ipynb
Automatically generated by Colab.
Original file is located at
    https://colab.research.google.com/drive/1rvEL1SEgzMO-40sZRxejM0A_mphdjwHF
"""
# NOTE: the original line `pip install ucimlrepo` is Colab shell magic and is a
# SyntaxError in a plain .py file. Install the dependency from a shell instead:
#   pip install ucimlrepo
import matplotlib.pyplot as plt
import random
import math
from ucimlrepo import fetch_ucirepo
# fetch dataset (UCI repository id 53 = Iris)
iris = fetch_ucirepo(id=53)
# data (as pandas dataframes)
features = iris.data.features
targets = iris.data.targets
# Question 3
# Pull sepal length/width for the first two classes: rows 0-49 are setosa,
# rows 50-99 are versicolor (UCI Iris ordering).
sepLSet = features['sepal length'].values[:50]
sepWSet = features['sepal width'].values[:50]
sepLVer = features['sepal length'].values[50:100]
sepWVer = features['sepal width'].values[50:100]
# Combined per-feature input lists (setosa first, then versicolor).
input1s = sepLSet.tolist() + sepLVer.tolist()
input2s = sepWSet.tolist() + sepWVer.tolist()
# Scatter plot of both classes in sepal-length / sepal-width space.
plt.title("Setosa and Versicolor")
plt.xlabel("sepal length")
plt.ylabel("sepal width")
setosa = plt.scatter(sepLSet, sepWSet, c='g', label="Setosa")
versicolor = plt.scatter(sepLVer, sepWVer, c='r', label="Versicolor")
plt.legend(handles=[setosa, versicolor])
# Question 5
# model function
def logistic(w, i):
    """Sigmoid of the affine score w[0] + w[1]*i[0] + w[2]*i[1].

    w -- weight vector [bias, w_length, w_width]
    i -- input pair [sepal length, sepal width]
    Returns the model's probability of the positive class (setosa).

    Uses the numerically stable two-branch form: the naive
    1/(1+exp(-z)) raises OverflowError once -z exceeds ~709.
    """
    z = w[0] + w[1] * i[0] + w[2] * i[1]
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    e = math.exp(z)  # z < 0, so exp(z) cannot overflow
    return e / (1 + e)
# update rule (stochastic)
def update(pred, i, y):
    """One stochastic gradient step on the module-level weight vector.

    pred -- model output for this sample
    i    -- input pair [sepal length, sepal width]
    y    -- true label (1 = setosa, 0 = versicolor)
    Mutates the global `weights` using the global learning rate `l_rate`.
    """
    step = l_rate * (y - pred) * pred * (1 - pred)
    # Feature vector is [1 (bias), sepal length, sepal width].
    for k, feat in enumerate((1, i[0], i[1])):
        weights[k] = weights[k] + step * feat
# iterate over all training samples
def epoch():
    """One pass over the training indices; return the accuracy for the pass.

    A sample counts as correct when the thresholded prediction (>= .5 means
    setosa) matches its label; otherwise the weights are updated in place.
    """
    correct = 0
    for idx in train_ind:
        sample = [input1s[idx], input2s[idx]]
        pred = logistic(weights, sample)
        label = 1 if idx <= 49 else 0  # rows 0-49 are setosa
        if (pred >= .5) == (label == 1):
            correct += 1
        else:
            # Misclassified: take a gradient step toward the true label.
            update(pred, sample, label)
    return correct / len(train_ind)
# run num_ep epochs and report accuracy rate
def train(num_ep):
    """Run `num_ep` epochs, appending each epoch's accuracy to the log."""
    for _ in range(num_ep):
        success_count_list.append(epoch())
# plot accuracy rate across all epochs
def report():
    """Print the final weights and plot per-epoch training accuracy (%)."""
    print("Final weight vector:", weights)
    epochs = range(1, len(success_count_list) + 1)
    plt.title("Training Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("% Correct")
    plt.xticks(epochs)
    plt.plot(epochs, [100 * rate for rate in success_count_list])
# determine accuracy rate for test data
def test():
    """Return classification accuracy over the held-out test indices."""
    hits = 0
    for idx in test_ind:
        is_setosa = 1 if idx <= 49 else 0  # rows 0-49 are setosa
        pred = logistic(weights, [input1s[idx], input2s[idx]])
        # Correct when the thresholded prediction agrees with the label.
        if (pred >= .5) == (is_setosa == 1):
            hits += 1
    return hits / len(test_ind)
# Question 4
# Randomly split the 100 sample indices into an 80-sample training batch and
# a 20-sample test batch. random.sample draws unique indices directly,
# replacing the original O(n^2) draw-and-reject loop over randint.
train_ind = random.sample(range(100), 80)
_train_set = set(train_ind)  # O(1) membership for the complement filter
test_ind = [num for num in range(100) if num not in _train_set]
print("Indices in training set:", train_ind)
print("Indices in test set:", test_ind)
# Question 6
# Train the model, plot per-epoch accuracy, then evaluate on the test batch.
# initialization
weights = [0.5, 0.5, 0.5]  # [bias, sepal-length weight, sepal-width weight]
l_rate = 0.01              # learning rate for the stochastic updates
success_count_list = []    # per-epoch training accuracy, filled by train()
# training
print("Initial weight vector", weights)
train(15)
report()
print(f"Test accuracy: {test() * 100}%")
"""Summary: I trained my model to classify a sample as either Setosa or not Setosa (Versicolor). The features used as inputs were sepal length and sepal width. An output of 0.5 or greater from the logistic regression model was treated as a positive identification of a Setosa sample."""