Not Given Label (internal metrics, computed without ground-truth labels)
1、Compactness (CP)
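For reference, the compactness implemented below is the mean distance of the points of each cluster to their own center, averaged over all K clusters (notation introduced here; w_k denotes the center of cluster Omega_k):

$$\overline{CP} = \frac{1}{K}\sum_{k=1}^{K}\frac{1}{|\Omega_k|}\sum_{x_i \in \Omega_k}\lVert x_i - w_k\rVert$$

A lower CP means the points of each cluster lie closer to their own center, i.e. tighter clusters.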
Code implementation
# -*- coding: utf-8 -*-
'''
Created by hushiwei on 2018/10/16
Desc : compute evaluation metrics for clustering results
Note : CP index, SP index, DB index, DVI index
'''
# imports
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans  # k-means clustering
from sklearn.datasets import make_blobs  # helper that generates synthetic data

from MachineLearning.cluster_evaluate.calculate_dvi import dunn


def distEclud(vecA, vecB, axis=None):
    '''
    Euclidean distance between two vectors (or between a vector and each row of a 2-D array).
    Same as : np.linalg.norm(np.asarray(datas[0].iloc[0].values) - np.asarray(datas[0].iloc[1].values))
    :param vecA: cluster center
    :param vecB: data point(s)
    :param axis: axis argument forwarded to np.sum
    :return: Euclidean distance(s)
    '''
    return np.sqrt(np.sum(np.power(vecA - vecB, 2), axis=axis))


def calculate_cp_i(cluster_center, cluster_point):
    '''
    Average distance of all points of one cluster to that cluster's center.
    :param cluster_center: cluster center
    :param cluster_point: points of the cluster
    :return: mean point-to-center distance
    '''
    return np.average(distEclud(cluster_center, cluster_point, axis=1))


def calculate_cp(cluster_centers, cluster_points):
    '''
    CP: for every cluster, the average distance of its points to the cluster center,
    averaged over all clusters. A lower CP means the clusters are more compact.
    :param cluster_centers: cluster centers, indexable by cluster id
    :param cluster_points: cluster points, indexable by cluster id
    :return: CP value
    '''
    cps = [calculate_cp_i(cluster_centers[i], cluster_points[i]) for i in range(len(cluster_centers))]
    cp = np.average(cps)
    return cp


def calculate_sp(cluster_centers):
    '''
    SP: average pairwise distance between cluster centers.
    A higher SP means the clusters are further apart.
    :param cluster_centers: cluster centers, indexable by cluster id
    :return: SP value
    '''
    k = len(cluster_centers)
    res = []
    for i in range(k):
        tmp = []
        for j in range(i + 1, k):
            tmp.append(distEclud(cluster_centers[i], cluster_centers[j]))
        res.append(np.sum(tmp))
    sp = (2 / (k * k - k)) * np.sum(res)  # 2 / (k^2 - k) = 1 / number of unordered center pairs
    return sp


def calculate_db(cluster_centers, cluster_points):
    '''
    DB (Davies-Bouldin): for each cluster take the worst-case ratio of the summed
    intra-cluster average distances (CP_i + CP_j) to the distance between the two
    cluster centers, then average these per-cluster maxima.
    A smaller DB means small intra-cluster distances together with large inter-cluster distances.
    :param cluster_centers: cluster centers, indexable by cluster id
    :param cluster_points: cluster points, indexable by cluster id
    :return: DB value
    '''
    n_cluster = len(cluster_centers)
    cps = [calculate_cp_i(cluster_centers[i], cluster_points[i]) for i in range(n_cluster)]
    db = []
    for i in range(n_cluster):
        # worst (largest) similarity ratio between cluster i and any other cluster
        ratios = [(cps[i] + cps[j]) / distEclud(cluster_centers[i], cluster_centers[j])
                  for j in range(n_cluster) if j != i]
        db.append(np.max(ratios))
    # note: taking one global maximum divided by n_cluster is not the Davies-Bouldin
    # definition; average the per-cluster maxima instead
    return np.average(db)


def calculate_dvi(cluster_points):
    '''
    DVI (Dunn index): the smallest distance between points of different clusters (inter-cluster)
    divided by the largest distance between two points of any single cluster (intra-cluster).
    A larger DVI means larger inter-cluster distances and smaller intra-cluster distances.
    :param cluster_points: cluster points, indexable by cluster id
    :return: Dunn index
    '''
    # maximum intra-cluster distance
    d1 = []
    d2 = []
    d3 = []
    for k, cluster_point in cluster_points.items():
        # iterate over every pair of points inside the cluster
        for i in range(len(cluster_point)):
            temp1 = cluster_point.iloc[i].values
            for j in range(len(cluster_point)):
                temp2 = cluster_point.iloc[j].values
                # distance between two points of the same cluster
                dist = distEclud(temp1, temp2)
                d1.insert(j, dist)
            d2.insert(i, max(d1))
            d1 = []
        d3.insert(k, max(d2))
        d2 = []
    xmax = max(d3)

    # minimum inter-cluster distance
    d1 = []
    d2 = []
    d3 = []
    d4 = []
    for k, cluster_point in cluster_points.items():
        # iterate over every point of the cluster
        for j in range(len(cluster_point)):
            temp1 = cluster_point.iloc[j].values
            for key in cluster_points.keys():
                if not key == k:
                    other_cluster_df = cluster_points[key]
                    for index in range(len(other_cluster_df)):
                        temp2 = other_cluster_df.iloc[index].values
                        dist = distEclud(temp1, temp2)
                        d1.insert(index, dist)
                    d2.insert(key, min(d1))
                    d1 = []
            d3.insert(j, min(d2))
            d2 = []
        d4.insert(k, min(d3))
    xmin = min(d4)

    dunn_index = xmin / xmax
    return dunn_index


if __name__ == '__main__':
    # 1. generate synthetic data
    N = 1000
    centers = 4
    X, Y = make_blobs(n_samples=N, n_features=2, centers=centers, random_state=28)
    # df = pd.DataFrame(X, columns=list('abcdefghij'))
    # df.to_csv(
    #     '/Users/hushiwei/PycharmProjects/gai_platform/data/data_spliter/local_debug_data/cluster_data/cluster_data.csv',
    #     sep=',', index=False)
    # sys.exit()

    # 2. fit the model
    km = KMeans(n_clusters=centers, init='random', random_state=28)
    km.fit(X)

    # predict with the model
    y_hat = km.predict(X[:10])
    print(y_hat)
    print("Within-cluster sum of squared distances (inertia_): %.5f" % km.inertia_)
    print("Average squared distance per sample to its cluster center: %.5f" % (km.inertia_ / N))
    print("Cluster center coordinates:")
    cluster_centers = km.cluster_centers_
    print(cluster_centers)
    print("score is simply the negative of the within-cluster sum of squared distances:")
    print(km.score(X))

    # count the number of samples in each cluster
    r1 = pd.Series(km.labels_).value_counts()
    kmlabels = km.labels_  # cluster label of every sample; labels_ is a fitted attribute
    X = pd.DataFrame(X, columns=list('ab'))
    # append the cluster label as an extra column (axis=1 joins horizontally, axis=0 would stack rows)
    r = pd.concat([X, pd.Series(kmlabels, index=X.index)], axis=1)
    r.columns = list(X.columns) + ['class']  # rename the header, adding a name for the label column
    print(r.head())

    # group the data by cluster: {cluster id: points of that cluster}
    cluster_centers = {k: value for k, value in enumerate(cluster_centers)}
    cluster_points = {label: r[r['class'] == label].drop(columns=['class']) for label in np.unique(km.labels_)}

    print('~' * 10, 'evaluate kmeans', '~' * 10)
    cp = calculate_cp(cluster_centers, cluster_points)
    print('cp : ', cp)
    sp = calculate_sp(cluster_centers)
    print('sp : ', sp)
    db = calculate_db(cluster_centers, cluster_points)
    print('db : ', db)

    start = time.time()
    # dvi = calculate_dvi(cluster_points)
    # print('dvi : ', dvi)
    a = [point.values for point in cluster_points.values()]
    di = dunn(a)
    print('di : ', di)
    print("dvi cost ", time.time() - start)
DVI calculation, code 2
# -*- coding: utf-8 -*-
'''
Created by hushiwei on 2018/10/23
Desc : compute the DVI (Dunn index)
Note : this implementation differs from the one above
'''
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


def delta(ck, cl):
    # smallest distance between any point of cluster ck and any point of cluster cl
    values = np.ones([len(ck), len(cl)]) * 10000
    for i in range(0, len(ck)):
        for j in range(0, len(cl)):
            values[i, j] = np.linalg.norm(ck[i] - cl[j])
    return np.min(values)


def big_delta(ci):
    # largest distance between any two points inside cluster ci (the cluster diameter)
    values = np.zeros([len(ci), len(ci)])
    for i in range(0, len(ci)):
        for j in range(0, len(ci)):
            values[i, j] = np.linalg.norm(ci[i] - ci[j])
    return np.max(values)


def dunn(k_list):
    """ Dunn index [CVI]

    Parameters
    ----------
    k_list : list of np.arrays
        A list containing a numpy array for each cluster |c| = number of clusters
        c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
    """
    deltas = np.ones([len(k_list), len(k_list)]) * 1000000
    big_deltas = np.zeros([len(k_list), 1])
    l_range = list(range(0, len(k_list)))
    for k in l_range:
        for l in (l_range[0:k] + l_range[k + 1:]):
            deltas[k, l] = delta(k_list[k], k_list[l])
        big_deltas[k] = big_delta(k_list[k])
    di = np.min(deltas) / np.max(big_deltas)
    return di


def delta_fast(ck, cl, distances):
    # ck and cl are boolean masks over all points; select the sub-matrix of pairwise
    # distances between the two clusters and return its smallest non-zero entry
    values = distances[np.where(ck)][:, np.where(cl)]
    values = values[np.nonzero(values)]
    return np.min(values)


def big_delta_fast(ci, distances):
    # largest pairwise distance inside the cluster selected by the boolean mask ci
    values = distances[np.where(ci)][:, np.where(ci)]
    values = values[np.nonzero(values)]
    return np.max(values)


def dunn_fast(points, labels):
    """ Dunn index - FAST (using sklearn pairwise euclidean_distance function)

    Parameters
    ----------
    points : np.array
        np.array([N, p]) of all points
    labels : np.array
        np.array([N]) labels of all points
    """
    distances = euclidean_distances(points)
    ks = np.sort(np.unique(labels))

    deltas = np.ones([len(ks), len(ks)]) * 1000000
    big_deltas = np.zeros([len(ks), 1])
    l_range = list(range(0, len(ks)))

    for k in l_range:
        for l in (l_range[0:k] + l_range[k + 1:]):
            deltas[k, l] = delta_fast((labels == ks[k]), (labels == ks[l]), distances)
        big_deltas[k] = big_delta_fast((labels == ks[k]), distances)

    di = np.min(deltas) / np.max(big_deltas)
    return di


def big_s(x, center):
    # average distance of the points in x to the given cluster center
    len_x = len(x)
    total = 0
    for i in range(len_x):
        total += np.linalg.norm(x[i] - center)
    return total / len_x


def davisbouldin(k_list, k_centers):
    """ Davies-Bouldin Index

    Parameters
    ----------
    k_list : list of np.arrays
        A list containing a numpy array for each cluster |c| = number of clusters
        c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
    k_centers : np.array
        The array of the cluster centers (prototypes) of type np.array([K, p])
    """
    len_k_list = len(k_list)
    big_ss = np.zeros([len_k_list], dtype=np.float64)
    d_eucs = np.zeros([len_k_list, len_k_list], dtype=np.float64)
    db = 0

    for k in range(len_k_list):
        big_ss[k] = big_s(k_list[k], k_centers[k])

    for k in range(len_k_list):
        for l in range(0, len_k_list):
            d_eucs[k, l] = np.linalg.norm(k_centers[k] - k_centers[l])

    for k in range(len_k_list):
        values = np.zeros([len_k_list - 1], dtype=np.float64)
        for l in range(0, k):
            values[l] = (big_ss[k] + big_ss[l]) / d_eucs[k, l]
        for l in range(k + 1, len_k_list):
            values[l - 1] = (big_ss[k] + big_ss[l]) / d_eucs[k, l]
        db += np.max(values)
    res = db / len_k_list
    return res
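A minimal usage sketch (not in the original file). It assumes this second module is saved as calculate_dvi.py, matching the import path used in the first script, and reuses the same synthetic-blob and KMeans setup:

# hypothetical usage of the functions above; the module name calculate_dvi is assumed
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

from calculate_dvi import dunn, dunn_fast, davisbouldin

X, _ = make_blobs(n_samples=1000, n_features=2, centers=4, random_state=28)
km = KMeans(n_clusters=4, init='random', random_state=28).fit(X)

# dunn / davisbouldin take one array of points per cluster; dunn_fast takes all points plus labels
k_list = [X[km.labels_ == k] for k in np.unique(km.labels_)]
print('dunn         : ', dunn(k_list))
print('dunn (fast)  : ', dunn_fast(X, km.labels_))
print('davisbouldin : ', davisbouldin(k_list, km.cluster_centers_))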