# Supervised Learning Classification: Gaussian Naive Bayes

2022/05/13 11:39

Naive Bayes applies Bayes' theorem under the "naive" assumption that the features $x_1,\dots,x_n$ are conditionally independent given the class $y$:

$P(y|x_1,\dots,x_n) = \frac{P(y)\prod_{i=1}^{n}P(x_i|y)}{P(x_1,\dots,x_n)}$

As a worked example, consider the customer records below and ask: will a female customer who owns a cat make a repeat purchase?

| Customer | Gender | Pet | Repurchase |
|----------|--------|-----|------------|
| A        | Male   | Cat | Yes        |
| B        | Male   | Dog | Yes        |
| C        | Female | Dog | Yes        |
| D        | Male   | Dog | Yes        |
| E        | Female | Cat | No         |
| F        | Female | Dog | No         |
| G        | Female | Cat | No         |

$P(\text{repurchase}\mid\text{female},\text{cat})=\frac{P(\text{repurchase})\times P(\text{female}\mid\text{repurchase})\times P(\text{cat}\mid\text{repurchase})}{P(\text{female},\text{cat})}$

Reading the counts off the table: $P(\text{repurchase})=\frac{4}{7}$, and among the four repurchasers one is female and one owns a cat, so $P(\text{female}\mid\text{repurchase})=P(\text{cat}\mid\text{repurchase})=\frac{1}{4}$. The denominator is estimated under the same independence assumption as $P(\text{female})\,P(\text{cat})=\frac{4}{7}\times\frac{3}{7}=\frac{12}{49}$:

$P(\text{repurchase}\mid\text{female},\text{cat})=\frac{4}{7}\times\frac{1}{4}\times\frac{1}{4}\Big/\frac{12}{49}=\frac{7}{48}\approx 14.6\%$

$\begin{eqnarray} P(\text{no repurchase}\mid\text{female},\text{cat})&=&\frac{P(\text{no repurchase})\times P(\text{female}\mid\text{no repurchase})\times P(\text{cat}\mid\text{no repurchase})}{P(\text{female},\text{cat})}\\ &=&\frac{3}{7}\times 1\times\frac{2}{3}\Big/\frac{12}{49}=\frac{7}{6}\approx 117\% \end{eqnarray}$

A score above 100% is not an arithmetic error: the independence estimate $\frac{12}{49}$ undershoots the true joint probability $P(\text{female},\text{cat})=\frac{2}{7}$, and since the denominator is the same for both classes, only the ratio of the two numerators matters. Normalizing the two scores gives roughly $11\%$ vs. $89\%$, so the model predicts no repurchase.
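The same numbers can be checked in a few lines of Python; this is a minimal sketch of the hand calculation above, where `records` and `naive_bayes_score` are illustrative names rather than any library API:

```python
# Purchase records from the table above: (gender, pet, repurchased).
records = [
    ("M", "cat", True), ("M", "dog", True), ("F", "dog", True),
    ("M", "dog", True), ("F", "cat", False), ("F", "dog", False),
    ("F", "cat", False),
]

def naive_bayes_score(gender, pet, label):
    """Unnormalized score P(label) * P(gender|label) * P(pet|label)."""
    rows = [r for r in records if r[2] == label]
    p_label = len(rows) / len(records)
    p_gender = sum(r[0] == gender for r in rows) / len(rows)
    p_pet = sum(r[1] == pet for r in rows) / len(rows)
    return p_label * p_gender * p_pet

yes = naive_bayes_score("F", "cat", True)    # 4/7 * 1/4 * 1/4
no = naive_bayes_score("F", "cat", False)    # 3/7 * 3/3 * 2/3
print(yes / (yes + no), no / (yes + no))     # ~0.111 vs ~0.889
```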

The table above has only categorical features. When a feature $x_i$ is continuous, Gaussian Naive Bayes instead models its class-conditional likelihood as a normal distribution, with per-class mean $\mu_y$ and variance $\sigma_y^2$ estimated from the training data:

$P(x_i|y)=\frac{1}{\sqrt{2\pi}\,\sigma_y}\exp\left(-\frac{(x_i-\mu_y)^2}{2\sigma_y^2}\right)$
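In practice this density is evaluated in log space to avoid underflow when many per-feature terms are multiplied; taking the logarithm gives exactly the term that `_joint_log_likelihood` in the code below sums over features:

$\log P(x_i\mid y) = -\frac{1}{2}\log\left(2\pi\sigma_y^2\right) - \frac{(x_i-\mu_y)^2}{2\sigma_y^2}$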

The naive_bayes.py listing below is a self-contained adaptation of scikit-learn's GaussianNB, with the label- and sample-weight-validation helpers inlined so that the module itself does not depend on sklearn.

naive_bayes.py

```python
# -*- coding: utf-8 -*-
import numbers
from collections.abc import Sequence
from itertools import chain

import numpy as np
from scipy.special import logsumexp

__all__ = ['GaussianNB']

def _check_sample_weight(sample_weight, X, dtype=None):
    """Validate sample weights.

    Note that passing sample_weight=None will output an array of ones.
    Therefore, in some cases, you may want to protect the call with:
    if sample_weight is not None:
        sample_weight = _check_sample_weight(...)

    Parameters
    ----------
    sample_weight : {ndarray, Number or None}, shape (n_samples,)
        Input sample weights.

    X : nd-array, list or sparse matrix
        Input data.

    dtype : dtype, default=None
        dtype of the validated sample_weight.
        If None, and the input sample_weight is an array, the dtype of the
        input is preserved; otherwise an array with the default numpy dtype
        is allocated.  If dtype is not one of float32, float64, or None,
        the output will be of dtype float64.

    Returns
    -------
    sample_weight : ndarray, shape (n_samples,)
        Validated sample weight. It is guaranteed to be "C" contiguous.
    """
    n_samples = len(X)

    if dtype is not None and dtype not in [np.float32, np.float64]:
        dtype = np.float64

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=dtype)
    elif isinstance(sample_weight, numbers.Number):
        sample_weight = np.full(n_samples, sample_weight, dtype=dtype)
    else:
        sample_weight = np.asarray(sample_weight)
        if dtype is None:
            # Preserve a float32/float64 input; anything else becomes float64.
            dtype = (sample_weight.dtype
                     if sample_weight.dtype in (np.float32, np.float64)
                     else np.float64)
        sample_weight = np.ascontiguousarray(sample_weight, dtype=dtype)
        if sample_weight.ndim != 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if sample_weight.shape != (n_samples,):
            raise ValueError("sample_weight.shape == {}, expected {}!"
                             .format(sample_weight.shape, (n_samples,)))
    return sample_weight

def _unique_multiclass(y):
    if hasattr(y, '__array__'):
        return np.unique(np.asarray(y))
    else:
        return set(y)

def _unique_indicator(y):
    # For a label indicator matrix, the "labels" are the column indices.
    return np.arange(y.shape[1])

_FN_UNIQUE_LABELS = {
    'binary': _unique_multiclass,
    'multiclass': _unique_multiclass,
    'multilabel-indicator': _unique_indicator,
}

def _is_integral_float(y):
    return y.dtype.kind == 'f' and np.all(y.astype(int) == y)

def is_multilabel(y):
    """Check if y is in a multilabel format.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    out : bool
        Return True, if y is in a multilabel format, else False.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    if hasattr(y, '__array__') or isinstance(y, Sequence):
        y = np.asarray(y)
    if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
        return False

    labels = np.unique(y)

    return len(labels) < 3 and (y.dtype.kind in 'biu' or  # bool, int, uint
                                _is_integral_float(labels))

def type_of_target(y):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

    * binary is more specific but compatible with multiclass.
    * multiclass of integers is more specific but compatible with
      continuous.
    * multilabel-indicator is more specific but compatible with
      multiclass-multioutput.

    Parameters
    ----------
    y : array-like

    Returns
    -------
    target_type : string
        One of:

        * 'continuous': y is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': y is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': y contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': y contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': y is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': y is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': y is array-like but none of the above, such as a 3d
          array, sequence of sequences, or an array of non-sequence objects.

    Examples
    --------
    >>> import numpy as np
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multilabel-indicator'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    if is_multilabel(y):
        return 'multilabel-indicator'

    try:
        y = np.asarray(y)
    except ValueError:
        # Known to fail in numpy 1.3 for array of arrays
        return 'unknown'

    # The old sequence of sequences format
    try:
        if (not hasattr(y[0], '__array__') and isinstance(y[0], Sequence)
                and not isinstance(y[0], str)):
            raise ValueError('You appear to be using a legacy multi-label data'
                             ' representation. Sequence of sequences are no'
                             ' longer supported; use a binary array or sparse'
                             ' matrix instead - the MultiLabelBinarizer'
                             ' transformer can convert to this format.')
    except IndexError:
        pass

    # Invalid inputs
    if y.ndim > 2 or (y.dtype == object and len(y) and
                      not isinstance(y.flat[0], str)):
        return 'unknown'  # [[[1, 2]]] or [obj_1] and not ["label_1"]

    if y.ndim == 2 and y.shape[1] == 0:
        return 'unknown'  # [[]]

    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # check float and contains non-integer float values
    if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        return 'continuous' + suffix

    if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
        return 'multiclass' + suffix  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
    else:
        return 'binary'  # [1, 2] or [["a"], ["b"]]

def unique_labels(*ys):
    """Extract an ordered array of unique labels.

    We don't allow:
    - mix of multilabel and multiclass (single label) targets
    - mix of label indicator matrix and anything else,
      because there are no explicit labels
    - mix of label indicator matrices of different sizes
    - mix of string and integer labels

    At the moment, we also don't allow "multiclass-multioutput" input type.

    Parameters
    ----------
    *ys : array-likes

    Returns
    -------
    out : numpy array of shape [n_unique_labels]
        An ordered array of unique labels.

    Examples
    --------
    >>> from sklearn.utils.multiclass import unique_labels
    >>> unique_labels([3, 5, 5, 5, 7, 7])
    array([3, 5, 7])
    >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
    array([1, 2, 3, 4])
    >>> unique_labels([1, 2, 10], [5, 11])
    array([ 1,  2,  5, 10, 11])
    """
    if not ys:
        raise ValueError('No argument has been passed.')

    # Check that we don't mix label formats
    ys_types = set(type_of_target(x) for x in ys)
    if ys_types == {"binary", "multiclass"}:
        ys_types = {"multiclass"}

    if len(ys_types) > 1:
        raise ValueError("Mix type of y not allowed, got types %s" % ys_types)

    label_type = ys_types.pop()

    # Check consistency for the indicator format
    if (label_type == "multilabel-indicator" and
            len(set(y.shape[1] for y in ys)) > 1):
        raise ValueError("Multi-label binary indicator input with "
                         "different numbers of labels")

    # Get the unique set of labels
    _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
    if not _unique_labels:
        raise ValueError("Unknown label type: %s" % repr(ys))

    ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys))

    # Check that we don't mix string type with number type
    if len(set(isinstance(label, str) for label in ys_labels)) > 1:
        raise ValueError("Mix of label input types (string and number)")

    return np.array(sorted(ys_labels))

def _check_partial_fit_first_call(clf, classes=None):
    """Private helper function for factorizing common classes param logic.

    Estimators that implement the partial_fit API need to be provided with
    the list of possible classes at the first call to partial_fit.

    Subsequent calls to partial_fit should check that classes is still
    consistent with a previous value of clf.classes_ when provided.

    This function returns True if it detects that this was the first call to
    partial_fit on clf. In that case the classes_ attribute is also
    set on clf.
    """
    if getattr(clf, 'classes_', None) is None and classes is None:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    elif classes is not None:
        if getattr(clf, 'classes_', None) is not None:
            if not np.array_equal(clf.classes_, unique_labels(classes)):
                raise ValueError(
                    "classes=%r is not the same as on last call "
                    "to partial_fit, was: %r" % (classes, clf.classes_))

        else:
            # This is the first call to partial_fit
            clf.classes_ = unique_labels(classes)
            return True

    # classes is None and clf.classes_ has already previously been set:
    # nothing to do
    return False

class _BaseNB:
    """Abstract base class for naive Bayes estimators."""

    def _joint_log_likelihood(self, X):
        """Compute the unnormalized posterior log probability of X.

        I.e. log P(c) + log P(x|c) for all rows x of X, as an array-like of
        shape (n_samples, n_classes).

        Input is passed to _joint_log_likelihood as-is by predict,
        predict_proba and predict_log_proba.
        """

    def _check_X(self, X):
        """To be overridden in subclasses with the actual checks."""
        # Note that this is not marked @abstractmethod as long as the
        # deprecated public alias sklearn.naive_bayes.BaseNB exists
        # (until 0.24) to preserve backward compat for 3rd party projects
        # with existing derived classes.
        return X

    def predict(self, X):
        """
        Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        C : ndarray of shape (n_samples,)
            Predicted target values for X.
        """
        X = self._check_X(X)
        jll = self._joint_log_likelihood(X)
        return self.classes_[np.argmax(jll, axis=1)]

    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Returns the log-probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute :term:`classes_`.
        """
        X = self._check_X(X)
        jll = self._joint_log_likelihood(X)
        # normalize by P(x) = P(f_1, ..., f_n)
        log_prob_x = logsumexp(jll, axis=1)
        return jll - np.atleast_2d(log_prob_x).T

    def predict_proba(self, X):
        """
        Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute :term:`classes_`.
        """
        return np.exp(self.predict_log_proba(X))

class GaussianNB(_BaseNB):
    """
    Gaussian Naive Bayes (GaussianNB).

    Can perform online updates to model parameters via :meth:`partial_fit`.
    For details on algorithm used to update feature means and variance online,
    see Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:

    http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf

    Read more in the :ref:`User Guide <gaussian_naive_bayes>`.

    Parameters
    ----------
    priors : array-like of shape (n_classes,)
        Prior probabilities of the classes. If specified the priors are not
        adjusted according to the data.

    var_smoothing : float, default=1e-9
        Portion of the largest variance of all features that is added to
        variances for calculation stability.

    Attributes
    ----------
    class_count_ : ndarray of shape (n_classes,)
        Number of training samples observed in each class.

    class_prior_ : ndarray of shape (n_classes,)
        Probability of each class.

    classes_ : ndarray of shape (n_classes,)
        Class labels known to the classifier.

    epsilon_ : float
        Absolute additive value to variances.

    sigma_ : ndarray of shape (n_classes, n_features)
        Variance of each feature per class.

    theta_ : ndarray of shape (n_classes, n_features)
        Mean of each feature per class.

    Examples
    --------
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> Y = np.array([1, 1, 1, 2, 2, 2])
    >>> from sklearn.naive_bayes import GaussianNB
    >>> clf = GaussianNB()
    >>> clf.fit(X, Y)
    GaussianNB()
    >>> print(clf.predict([[-0.8, -1]]))
    [1]
    >>> clf_pf = GaussianNB()
    >>> clf_pf.partial_fit(X, Y, np.unique(Y))
    GaussianNB()
    >>> print(clf_pf.predict([[-0.8, -1]]))
    [1]
    """

    def __init__(self, *, priors=None, var_smoothing=1e-9):
        self.priors = priors
        self.var_smoothing = var_smoothing

    def fit(self, X, y, sample_weight=None):
        """Fit Gaussian Naive Bayes according to X, y.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Weights applied to individual samples (1. for unweighted).
            Gaussian Naive Bayes supports fitting with *sample_weight*.

        Returns
        -------
        self : object
        """
        return self._partial_fit(X, y, np.unique(y), _refit=True,
                                 sample_weight=sample_weight)

    def _check_X(self, X):
        """No input validation in this simplified version; return X as-is."""
        return X

    @staticmethod
    def _update_mean_variance(n_past, mu, var, X, sample_weight=None):
        """Compute online update of Gaussian mean and variance.

        Given starting sample count, mean, and variance, a new set of
        points X, and optionally sample weights, return the updated mean and
        variance. (NB - each dimension (column) in X is treated as independent
        -- you get variance, not covariance).

        Can take scalar mean and variance, or vector mean and variance to
        simultaneously update a number of independent Gaussians.

        See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:

        http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf

        Parameters
        ----------
        n_past : int
            Number of samples represented in old mean and variance. If sample
            weights were given, this should contain the sum of sample
            weights represented in old mean and variance.

        mu : array-like of shape (number of Gaussians,)
            Means for Gaussians in original set.

        var : array-like of shape (number of Gaussians,)
            Variances for Gaussians in original set.

        X : array-like of shape (n_samples, n_features)
            New data points to fold into the running statistics.

        sample_weight : array-like of shape (n_samples,), default=None
            Weights applied to individual samples (1. for unweighted).

        Returns
        -------
        total_mu : array-like of shape (number of Gaussians,)
            Updated mean for each Gaussian over the combined set.

        total_var : array-like of shape (number of Gaussians,)
            Updated variance for each Gaussian over the combined set.
        """
        if X.shape[0] == 0:
            return mu, var

        # Compute (potentially weighted) mean and variance of new datapoints
        if sample_weight is not None:
            n_new = float(sample_weight.sum())
            new_mu = np.average(X, axis=0, weights=sample_weight)
            new_var = np.average((X - new_mu) ** 2, axis=0,
                                 weights=sample_weight)
        else:
            n_new = X.shape[0]
            new_var = np.var(X, axis=0)
            new_mu = np.mean(X, axis=0)

        if n_past == 0:
            return new_mu, new_var

        n_total = float(n_past + n_new)

        # Combine mean of old and new data, taking into consideration
        # (weighted) number of observations
        total_mu = (n_new * new_mu + n_past * mu) / n_total

        # Combine variance of old and new data, taking into consideration
        # (weighted) number of observations. This is achieved by combining
        # the sum-of-squared-differences (ssd)
        old_ssd = n_past * var
        new_ssd = n_new * new_var
        total_ssd = (old_ssd + new_ssd +
                     (n_new * n_past / n_total) * (mu - new_mu) ** 2)
        total_var = total_ssd / n_total

        return total_mu, total_var

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """Incremental fit on a batch of samples.

        This method is expected to be called several times consecutively
        on different chunks of a dataset so as to implement out-of-core
        or online learning.

        This is especially useful when the whole dataset is too big to fit in
        memory at once.

        This method has some performance and numerical stability overhead,
        hence it is better to call partial_fit on chunks of data that are
        as large as possible (as long as fitting in the memory budget) to
        hide the overhead.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        classes : array-like of shape (n_classes,), default=None
            List of all the classes that can possibly appear in the y vector.
            Must be provided at the first call to partial_fit, can be omitted
            in subsequent calls.

        sample_weight : array-like of shape (n_samples,), default=None
            Weights applied to individual samples (1. for unweighted).

        Returns
        -------
        self : object
        """
        return self._partial_fit(X, y, classes, _refit=False,
                                 sample_weight=sample_weight)

    def _partial_fit(self, X, y, classes=None, _refit=False,
                     sample_weight=None):
        """Actual implementation of Gaussian NB fitting.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        classes : array-like of shape (n_classes,), default=None
            List of all the classes that can possibly appear in the y vector.
            Must be provided at the first call to partial_fit, can be omitted
            in subsequent calls.

        _refit : bool, default=False
            If true, act as though this were the first time we called
            _partial_fit (ie, throw away any past fitting and start over).

        sample_weight : array-like of shape (n_samples,), default=None
            Weights applied to individual samples (1. for unweighted).

        Returns
        -------
        self : object
        """
        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        # If the ratio of data variance between dimensions is too small, it
        # will cause numerical errors. To address this, we artificially
        # boost the variance by epsilon, a small fraction of the standard
        # deviation of the largest dimension.
        self.epsilon_ = self.var_smoothing * np.var(X, axis=0).max()

        if _refit:
            self.classes_ = None

        if _check_partial_fit_first_call(self, classes):
            # This is the first call to partial_fit:
            # initialize various cumulative counters
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = np.zeros((n_classes, n_features))
            self.sigma_ = np.zeros((n_classes, n_features))

            self.class_count_ = np.zeros(n_classes, dtype=np.float64)

            # Initialise the class prior, taking any user-supplied
            # priors into account
            if self.priors is not None:
                priors = np.asarray(self.priors)
                # Check that the provided priors match the number of classes
                if len(priors) != n_classes:
                    raise ValueError('Number of priors must match number of'
                                     ' classes.')
                # Check that the priors sum to 1
                if not np.isclose(priors.sum(), 1.0):
                    raise ValueError('The sum of the priors should be 1.')
                # Check that the priors are non-negative
                if (priors < 0).any():
                    raise ValueError('Priors must be non-negative.')
                self.class_prior_ = priors
            else:
                # Initialize the priors to zeros for each class
                self.class_prior_ = np.zeros(len(self.classes_),
                                             dtype=np.float64)
        else:
            if X.shape[1] != self.theta_.shape[1]:
                msg = "Number of features %d does not match previous data %d."
                raise ValueError(msg % (X.shape[1], self.theta_.shape[1]))
            # Take epsilon back out each time, so that the smoothing does
            # not accumulate across successive calls to partial_fit
            self.sigma_[:, :] -= self.epsilon_

        classes = self.classes_

        unique_y = np.unique(y)
        unique_y_in_classes = np.in1d(unique_y, classes)

        if not np.all(unique_y_in_classes):
            raise ValueError("The target label(s) %s in y do not exist in the "
                             "initial classes %s" %
                             (unique_y[~unique_y_in_classes], classes))

        for y_i in unique_y:
            i = classes.searchsorted(y_i)
            X_i = X[y == y_i, :]

            if sample_weight is not None:
                sw_i = sample_weight[y == y_i]
                N_i = sw_i.sum()
            else:
                sw_i = None
                N_i = X_i.shape[0]

            new_theta, new_sigma = self._update_mean_variance(
                self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
                X_i, sw_i)

            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_[i] += N_i

        self.sigma_[:, :] += self.epsilon_

        # Update the empirical prior only if no priors were provided
        if self.priors is None:
            # Empirical prior, with sample_weight taken into account
            self.class_prior_ = self.class_count_ / self.class_count_.sum()

        return self

    def _joint_log_likelihood(self, X):
        # Sum the log prior and the per-feature Gaussian log densities for
        # each class; rows are samples, columns follow self.classes_.
        joint_log_likelihood = []
        for i in range(np.size(self.classes_)):
            jointi = np.log(self.class_prior_[i])
            n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
            n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
                                 (self.sigma_[i, :]), 1)
            joint_log_likelihood.append(jointi + n_ij)

        joint_log_likelihood = np.array(joint_log_likelihood).T
        return joint_log_likelihood
```
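
Since `partial_fit` exists precisely for data that arrives in chunks, the following is a small sketch of out-of-core training with the module above; the two-way split and the shuffling seed are arbitrary choices for illustration, not part of the original listing:

```python
import numpy as np
from sklearn.datasets import load_iris
from naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True)

# Shuffle so that each chunk contains a mix of classes.
rng = np.random.default_rng(0)
order = rng.permutation(len(X))
X, y = X[order], y[order]

gnb = GaussianNB()
for chunk in np.array_split(np.arange(len(X)), 2):
    # classes is required on the first call; later calls only check it.
    gnb.partial_fit(X[chunk], y[chunk], classes=np.unique(y))

print(gnb.predict(X[:5]))
```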


gnb_test.py

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from naive_bayes import GaussianNB

# Load the iris features and labels.
X, y = load_iris(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=0)

gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))
```
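
To inspect the class posteriors rather than only the hard labels, `predict_proba` on the fitted model can be printed as well; a small optional addition to the script above:

```python
# Posterior probabilities for the first three test samples; columns
# follow the sorted class order stored in gnb.classes_.
print(gnb.classes_)
print(gnb.predict_proba(X_test[:3]))
```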
