SVM derivation; for the full details see pluskid's blog.
Basic derivation
SMO optimization
The values of $\alpha_1$ and $\alpha_2$ lie on a line segment inside the box $[0, C] \times [0, C]$ (see the figure in pluskid's post). Suppose the upper and lower bounds of $\alpha_2$ are $H$ and $L$.

When $y_1 \ne y_2$: $\alpha_1 - \alpha_2 = k$.

At the endpoint where $\alpha_1 = C$ we have $\alpha_1 - \alpha_2 = C - H$, so $H = C - \alpha_1 + \alpha_2$.

At the endpoint where $\alpha_1 = 0$ we have $\alpha_1 - \alpha_2 = 0 - L$, so $L = \alpha_2 - \alpha_1$.

Similarly, when $y_1 = y_2$: $\alpha_1 + \alpha_2 = k$.

From $\alpha_1 + \alpha_2 = 0 + H$, we get $H = \alpha_1 + \alpha_2$.

From $\alpha_1 + \alpha_2 = C + L$, we get $L = \alpha_1 + \alpha_2 - C$.

Combining these with the box constraint $0 \le \alpha_2 \le C$:

$$y_1 \ne y_2:\quad L = \max(0,\ \alpha_2 - \alpha_1),\qquad H = \min(C,\ C - \alpha_1 + \alpha_2)$$
$$y_1 = y_2:\quad L = \max(0,\ \alpha_1 + \alpha_2 - C),\qquad H = \min(C,\ \alpha_1 + \alpha_2)$$
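As a quick numeric check (values chosen purely for illustration): take $C = 1$, $\alpha_1 = 0.3$, $\alpha_2 = 0.6$. Then

$$y_1 \ne y_2:\quad L = \max(0,\ 0.6 - 0.3) = 0.3,\qquad H = \min(1,\ 1 - 0.3 + 0.6) = 1$$
$$y_1 = y_2:\quad L = \max(0,\ 0.3 + 0.6 - 1) = 0,\qquad H = \min(1,\ 0.3 + 0.6) = 0.9$$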
The update for $\alpha_2$ is

$$\alpha_2^{new} = \alpha_2 + \frac{y_2 (E_1 - E_2)}{\eta}$$

where

$$\eta = K(x_1, x_1) + K(x_2, x_2) - 2K(x_1, x_2), \qquad E_i = \text{predict}_i - y_i$$

Because $\alpha_2$ must lie within $[L, H]$, the value is clipped if it falls outside that range:

$$\alpha_2^{new,clip} =
\begin{cases}
H & \text{if } \alpha_2^{new} \ge H \\
\alpha_2^{new} & \text{if } L < \alpha_2^{new} < H \\
L & \text{if } \alpha_2^{new} \le L
\end{cases}$$
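Continuing the illustrative numbers above with $y_1 \ne y_2$ (so $L = 0.3$, $H = 1$): if $y_2 = 1$, $E_1 - E_2 = 1.2$, and $\eta = 2$, then

$$\alpha_2^{new} = 0.6 + \frac{1 \times 1.2}{2} = 1.2 \ge H \quad\Rightarrow\quad \alpha_2^{new,clip} = H = 1$$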
$\alpha_1$ and $\alpha_2$ are linearly related ($y_1 \alpha_1 + y_2 \alpha_2$ is constant), so

$$\alpha_1^{new} - \alpha_1 = y_1 y_2 (\alpha_2 - \alpha_2^{new,clip})$$

and therefore

$$\alpha_1^{new} = \alpha_1 + y_1 y_2 (\alpha_2 - \alpha_2^{new,clip})$$
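With the same illustrative numbers ($y_1 y_2 = -1$ since $y_1 \ne y_2$):

$$\alpha_1^{new} = 0.3 + (-1)(0.6 - 1) = 0.7$$

and the constraint is preserved: $\alpha_1^{new} - \alpha_2^{new,clip} = 0.7 - 1 = -0.3 = \alpha_1 - \alpha_2$.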
Solving for $b$: we simply want a value of $b$ that makes the current prediction agree with the label. The decision function is

$$f(x) = \sum_{j=1}^{N} y_j \alpha_j K(x_j, x) - b$$

Denote $s_j = y_j \alpha_j K(x_j, x_1)$. Then

$$f(x_1) = s_1 + s_2 + \text{constant} - b$$

$$f(x_1)^{new} = s_1^{new} + s_2^{new} + \text{constant} - b^{new}$$

Taking the difference:

$$f(x_1)^{new} - f(x_1) = s_1^{new} - s_1 + s_2^{new} - s_2 - (b^{new} - b)$$

We want this difference to cancel the previous prediction error, i.e.

$$E_1 + f(x_1)^{new} - f(x_1) = E_1 + s_1^{new} - s_1 + s_2^{new} - s_2 - (b^{new} - b) = 0$$
Solving the equation above for $b^{new}$ gives one candidate value:

$$b_1 = b + E_1 + y_1(\alpha_1^{new} - \alpha_1)K(x_1, x_1) + y_2(\alpha_2^{new,clip} - \alpha_2)K(x_1, x_2)$$

Similarly, sample $x_2$ yields another candidate $b_2$. (The implementation below writes the decision function as $f(x) = \sum_j y_j \alpha_j K(x_j, x) + b$, as in the simplified-SMO notes, which flips the signs in these expressions.)

Finally, choose between $b_1$ and $b_2$ as follows (see the sketch after this list):

- if $\alpha_1$ is not at the boundary (i.e. $0 < \alpha_1 < C$), take $b_1$;
- otherwise, if $\alpha_2$ is not at the boundary, take $b_2$;
- otherwise, take the average of $b_1$ and $b_2$.
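A compact sketch of this selection rule; select_b is a hypothetical helper, and the same logic appears inline in _train in the implementation below:

def select_b(b1, b2, alpha_1, alpha_2, C):
    # An alpha strictly inside (0, C) belongs to a point exactly on the
    # margin, so the b estimate derived from it is exact.
    if 0 < alpha_1 < C:
        return b1
    if 0 < alpha_2 < C:
        return b2
    return 0.5 * (b1 + b2)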
Code
# coding:utf-8
from mla.base import BaseEstimator
from mla.svm.kernerls import Linear
import numpy as np
import logging
np.random.seed(9999)
"""
References:
The Simplified SMO Algorithm http://cs229.stanford.edu/materials/smo.pdf
"""
class SVM(BaseEstimator):
    def __init__(self, C=1.0, kernel=None, tol=1e-3, max_iter=100):
        """Support vector machine implementation using simplified SMO optimization.

        Parameters
        ----------
        C : float, default 1.0
            Penalty parameter of the error term.
        kernel : Kernel object, default None
            Kernel function; if None, a linear kernel is used.
        tol : float, default 1e-3
            Tolerance for the convergence check.
        max_iter : int, default 100
            Maximum number of passes over the training set.
        """
        self.C = C
        self.tol = tol
        self.max_iter = max_iter
        if kernel is None:
            self.kernel = Linear()
        else:
            self.kernel = kernel
        self.b = 0
        self.alpha = None
        self.K = None
    def fit(self, X, y=None):
        self._setup_input(X, y)
        # Precompute the kernel (Gram) matrix.
        self.K = np.zeros((self.n_samples, self.n_samples))
        for i in range(self.n_samples):
            self.K[:, i] = self.kernel(self.X, self.X[i, :])
        self.alpha = np.zeros(self.n_samples)
        self.sv_idx = np.arange(0, self.n_samples)
        self._train()
    def _train(self):
        iters = 0
        while iters < self.max_iter:
            iters += 1
            alpha_prev = np.copy(self.alpha)

            for j in range(self.n_samples):
                # Pick a random i distinct from j
                i = self.random_index(j)

                # eta here is the negative of the eta in the derivation above,
                # hence the subtraction in the update and the >= 0 skip.
                eta = 2.0 * self.K[i, j] - self.K[i, i] - self.K[j, j]
                if eta >= 0:
                    continue

                L, H = self._find_bounds(i, j)

                # Errors for the current examples
                e_i, e_j = self._error(i), self._error(j)

                # Save old alphas
                alpha_io, alpha_jo = self.alpha[i], self.alpha[j]

                # Update alpha
                self.alpha[j] -= (self.y[j] * (e_i - e_j)) / eta
                self.alpha[j] = self.clip(self.alpha[j], H, L)

                self.alpha[i] = self.alpha[i] + self.y[i] * self.y[j] * (alpha_jo - self.alpha[j])

                # Find intercept
                b1 = self.b - e_i - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, i] - \
                     self.y[j] * (self.alpha[j] - alpha_jo) * self.K[i, j]
                b2 = self.b - e_j - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[j, j] - \
                     self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, j]
                if 0 < self.alpha[i] < self.C:
                    self.b = b1
                elif 0 < self.alpha[j] < self.C:
                    self.b = b2
                else:
                    self.b = 0.5 * (b1 + b2)

            # Check convergence
            diff = np.linalg.norm(self.alpha - alpha_prev)
            if diff < self.tol:
                break
        logging.info('Convergence reached after %s iterations.' % iters)

        # Save support vector indices
        self.sv_idx = np.where(self.alpha > 0)[0]
    def _predict(self, X=None):
        n = X.shape[0]
        result = np.zeros(n)
        for i in range(n):
            result[i] = np.sign(self._predict_row(X[i, :]))
        return result

    def _predict_row(self, X):
        # Evaluate the decision function using support vectors only.
        k_v = self.kernel(self.X[self.sv_idx], X)
        return np.dot((self.alpha[self.sv_idx] * self.y[self.sv_idx]).T, k_v.T) + self.b
    def clip(self, alpha, H, L):
        if alpha > H:
            alpha = H
        if alpha < L:
            alpha = L
        return alpha
    def _error(self, i):
        """Error for a single example: prediction minus label."""
        return self._predict_row(self.X[i]) - self.y[i]

    def _find_bounds(self, i, j):
        """Find L and H such that L <= alpha <= H.

        Also, alpha must satisfy the constraint 0 <= alpha <= C.
        """
        if self.y[i] != self.y[j]:
            L = max(0, self.alpha[j] - self.alpha[i])
            H = min(self.C, self.C - self.alpha[i] + self.alpha[j])
        else:
            L = max(0, self.alpha[i] + self.alpha[j] - self.C)
            H = min(self.C, self.alpha[i] + self.alpha[j])
        return L, H
    def random_index(self, z):
        # Pick a random index different from z. np.random.randint's upper
        # bound is exclusive, so use n_samples (not n_samples - 1) to make
        # every index reachable.
        i = z
        while i == z:
            i = np.random.randint(0, self.n_samples)
        return i
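A minimal usage sketch on toy data. It assumes, as in the mla repository this code is adapted from, that BaseEstimator exposes a public predict method delegating to _predict; the data and expected output are illustrative only:

if __name__ == '__main__':
    # Toy linearly separable problem; labels must be in {-1, 1}.
    X_train = np.array([[2.0, 2.0], [1.5, 2.5], [-2.0, -1.0], [-1.0, -2.0]])
    y_train = np.array([1, 1, -1, -1])

    model = SVM(C=1.0, max_iter=60)
    model.fit(X_train, y_train)
    print(model.predict(np.array([[1.0, 1.5], [-1.5, -1.5]])))  # expect [ 1. -1.]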
References:
- pluskid's SVM series: http://blog.pluskid.org/?page_id=683
- CS229 lecture notes, Part V (Support Vector Machines): http://cs229.stanford.edu/notes/cs229-notes3.pdf
- The Simplified SMO Algorithm: http://cs229.stanford.edu/materials/smo.pdf
- J. Platt, Sequential Minimal Optimization (MSR-TR-98-14): https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-98-14.pdf