Evaluation Metrics for Clustering Algorithms

2018/10/16 posted in MachineLearning

Without Ground-Truth Labels

The four metrics below are internal: they score a clustering from the data and the cluster assignments alone, with no reference labels.

1. Compactness (CP)
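CP measures how tightly the samples of each cluster sit around their center: for cluster k with center w_k, CP_k is the mean Euclidean distance ||x − w_k|| over the cluster's points, and the overall score is CP = (1/K) · Σ_k CP_k. Lower CP means more compact clusters. The listing below implements CP together with SP (separation), DB (Davies-Bouldin), and DVI (Dunn index).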

Code implementation

# -*- coding: utf-8 -*-
'''
    Created by hushiwei on 2018/10/16
    Desc : compute internal evaluation metrics for clustering results
    Note : CP, SP, DB (Davies-Bouldin), DVI (Dunn index)
'''

# imports
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans  # the KMeans estimator
from sklearn.datasets import make_blobs  # synthetic blob generator
from MachineLearning.cluster_evaluate.calculate_dvi import dunn

def distEclud(vecA, vecB, axis=None):
    '''
    Euclidean distance between two vectors (or row-wise along an axis).
    Same as: np.linalg.norm(np.asarray(datas[0].iloc[0].values) - np.asarray(datas[0].iloc[1].values))
    :param vecA: cluster center
    :param vecB: data point(s)
    :param axis: forwarded to np.sum
    :return: the distance (a scalar, or an array when axis is given)
    '''
    return np.sqrt(np.sum(np.power(vecA - vecB, 2), axis=axis))


def calculate_cp_i(cluster_center, cluster_point):
    '''
    Average distance from each point of one cluster to its center.
    :param cluster_center: the cluster's center
    :param cluster_point: DataFrame of the cluster's samples
    :return: mean intra-cluster distance
    '''
    return np.average(distEclud(cluster_center, cluster_point, axis=1))


def calculate_cp(cluster_centers, cluster_points):
    '''
    CP: average distance from each cluster's points to its center,
    then averaged over all clusters. Lower CP means tighter clusters.
    :param cluster_centers: dict {label: center}
    :param cluster_points: dict {label: DataFrame of samples}
    :return: mean compactness over all clusters
    '''

    cps = [calculate_cp_i(cluster_centers[i], cluster_points[i]) for i in range(len(cluster_centers))]
    cp = np.average(cps)
    return cp


def calculate_sp(cluster_centers):
    '''
    SP: average pairwise distance between cluster centers.
    Higher SP means better-separated clusters.
    :param cluster_centers: dict {label: center}
    :return: mean separation between centers
    '''
    k = len(cluster_centers)
    res = []
    for i in range(k):
        tmp = []
        for j in range(i + 1, k):
            tmp.append(distEclud(cluster_centers[i], cluster_centers[j]))
        res.append(np.sum(tmp))

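    # k centers form k*(k-1)/2 unordered pairs, hence the 2 / (k*k - k) factor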
    sp = (2 / (k * k - k)) * np.sum(res)
    return sp


def calculate_db(cluster_centers, cluster_points):
    '''
    DB (Davies-Bouldin): for each cluster, take the worst-case ratio of the
    summed intra-cluster scatters (CP) of a cluster pair to the distance
    between their two centers, then average those per-cluster maxima.
    Lower DB means tighter clusters that are farther apart.
    :param cluster_centers: dict {label: center}
    :param cluster_points: dict {label: DataFrame of samples}
    :return: the Davies-Bouldin index
    '''
    n_cluster = len(cluster_centers)
    cps = [calculate_cp_i(cluster_centers[i], cluster_points[i]) for i in range(n_cluster)]
    maxima = []

    for i in range(n_cluster):
        # worst (largest) scatter-to-separation ratio of cluster i vs. any other
        ratios = [(cps[i] + cps[j]) / distEclud(cluster_centers[i], cluster_centers[j])
                  for j in range(n_cluster) if j != i]
        maxima.append(np.max(ratios))
    db = np.average(maxima)
    return db


def calculate_dvi(cluster_points):
    '''
    DVI (Dunn index): the minimum distance between points of different clusters
    (inter-cluster separation) divided by the maximum pairwise distance within
    any single cluster (intra-cluster diameter).
    Higher DVI means larger separation and smaller diameters.
    :param cluster_points: dict {label: DataFrame of samples}
    :return: the Dunn index
    '''
    # maximum intra-cluster distance (the largest cluster diameter)
    d1 = []
    d2 = []
    d3 = []
    for k, cluster_point in cluster_points.items():
        # compare every pair of elements within this cluster
        for i in range(len(cluster_point)):
            temp1 = cluster_point.iloc[i].values
            for j in range(len(cluster_point)):
                temp2 = cluster_point.iloc[j].values

                # distance between two elements of the same cluster
                dist = distEclud(temp1, temp2)
                d1.insert(j, dist)

            d2.insert(i, max(d1))
            d1 = []

        d3.insert(k, max(d2))
        d2 = []
    xmax = max(d3)

    # minimum inter-cluster distance (the smallest separation)
    d1 = []
    d2 = []
    d3 = []
    d4 = []
    for k, cluster_point in cluster_points.items():
        # compare every element of this cluster against every other cluster
        for j in range(len(cluster_point)):
            temp1 = cluster_point.iloc[j].values
            for key in cluster_points.keys():
                if not key == k:
                    other_cluster_df = cluster_points[key]
                    for index in range(len(other_cluster_df)):
                        temp2 = other_cluster_df.iloc[index].values
                        dist = distEclud(temp1, temp2)
                        d1.insert(index, dist)

                    d2.insert(key, min(d1))
                    d1 = []
            d3.insert(j, min(d2))
            d2 = []
        d4.insert(k, min(d3))
    xmin = min(d4)

    dunn_index = xmin / xmax
    return dunn_index
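# Note: the triple loop above touches every point pair through pandas .iloc and
# is very slow; dunn()/dunn_fast() in listing 2 below compute the same index on
# numpy arrays instead.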


if __name__ == '__main__':
    # 1. generate synthetic data
    N = 1000
    centers = 4
    X, Y = make_blobs(n_samples=N, n_features=2, centers=centers, random_state=28)

    # 2. build the model
    km = KMeans(n_clusters=centers, init='random', random_state=28)
    km.fit(X)

    # predict a few samples
    y_hat = km.predict(X[:10])
    print(y_hat)

    print("所有样本距离所属簇中心点的总距离和为:%.5f" % km.inertia_)
    print("所有样本距离所属簇中心点的平均距离为:%.5f" % (km.inertia_ / N))

    print("所有的中心点聚类中心坐标:")
    cluster_centers = km.cluster_centers_
    print(cluster_centers)

    print("score其实就是所有样本点离所属簇中心点距离和的相反数:")
    print(km.score(X))

    # count how many samples fall in each cluster
    r1 = pd.Series(km.labels_).value_counts()

    kmlabels = km.labels_  # labels_ holds the fitted cluster id of every sample
    X = pd.DataFrame(X, columns=list('ab'))
    r = pd.concat([X, pd.Series(kmlabels, index=X.index)], axis=1)  # append each sample's cluster id as a column (axis=1: horizontal concat)

    r.columns = list(X.columns) + ['class']  # rename headers, adding one for the new label column
    print(r.head())

    # group the data by cluster: {cluster id: DataFrame of its points}
    cluster_centers = {k: value for k, value in enumerate(cluster_centers)}
    cluster_points = {label: r[r['class'] == label].drop(columns=['class']) for label in np.unique(km.labels_)}

    print('~' * 10, 'evaluate kmeans', '~' * 10)

    cp = calculate_cp(cluster_centers, cluster_points)
    print('cp : ', cp)

    sp = calculate_sp(cluster_centers)
    print('sp : ', sp)

    db = calculate_db(cluster_centers, cluster_points)
    print('db : ', db)

    start = time.time()
    # the pure-Python calculate_dvi gives the same result but is very slow:
    # dvi = calculate_dvi(cluster_points)
    # print('dvi : ', dvi)

    # dunn() from listing 2 computes the index from one array per cluster
    a = [point.values for point in cluster_points.values()]
    di = dunn(a)
    print('di : ', di)

    print("dvi cost ", time.time() - start)

Computing DVI, listing 2

# -*- coding: utf-8 -*-
'''
    Created by hushiwei on 2018/10/23
    Desc : compute the Dunn index (DVI)
    Note : a faster, array-based variant of the calculate_dvi above
'''

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


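# delta: minimum pairwise distance between two clusters (inter-cluster
# separation); 10000 is just a large sentinel for the initial matrix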
def delta(ck, cl):
    values = np.ones([len(ck), len(cl)]) * 10000

    for i in range(0, len(ck)):
        for j in range(0, len(cl)):
            values[i, j] = np.linalg.norm(ck[i] - cl[j])

    return np.min(values)


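# big_delta: maximum pairwise distance within one cluster (its diameter)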
def big_delta(ci):
    values = np.zeros([len(ci), len(ci)])

    for i in range(0, len(ci)):
        for j in range(0, len(ci)):
            values[i, j] = np.linalg.norm(ci[i] - ci[j])

    return np.max(values)


def dunn(k_list):
    """ Dunn index [CVI]

    Parameters
    ----------
    k_list : list of np.arrays
        A list containing a numpy array for each cluster |c| = number of clusters
        c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
    """
    deltas = np.ones([len(k_list), len(k_list)]) * 1000000
    big_deltas = np.zeros([len(k_list), 1])
    l_range = list(range(0, len(k_list)))

    for k in l_range:
        for l in (l_range[0:k] + l_range[k + 1:]):
            deltas[k, l] = delta(k_list[k], k_list[l])

        big_deltas[k] = big_delta(k_list[k])

    di = np.min(deltas) / np.max(big_deltas)
    return di


def delta_fast(ck, cl, distances):
    values = distances[np.where(ck)][:, np.where(cl)]
    values = values[np.nonzero(values)]

    return np.min(values)


def big_delta_fast(ci, distances):
    values = distances[np.where(ci)][:, np.where(ci)]
    values = values[np.nonzero(values)]

    return np.max(values)


def dunn_fast(points, labels):
    """ Dunn index - FAST (using sklearn pairwise euclidean_distance function)

    Parameters
    ----------
    points : np.array
        np.array([N, p]) of all points
    labels: np.array
        np.array([N]) labels of all points
    """
    distances = euclidean_distances(points)
    ks = np.sort(np.unique(labels))

    deltas = np.ones([len(ks), len(ks)]) * 1000000
    big_deltas = np.zeros([len(ks), 1])

    l_range = list(range(0, len(ks)))

    for k in l_range:
        for l in (l_range[0:k] + l_range[k + 1:]):
            deltas[k, l] = delta_fast((labels == ks[k]), (labels == ks[l]), distances)

        big_deltas[k] = big_delta_fast((labels == ks[k]), distances)

    di = np.min(deltas) / np.max(big_deltas)
    return di


def big_s(x, center):
    len_x = len(x)
    total = 0

    for i in range(len_x):
        total += np.linalg.norm(x[i] - center)

    return total / len_x


def davisbouldin(k_list, k_centers):
    """ Davis Bouldin Index

    Parameters
    ----------
    k_list : list of np.arrays
        A list containing a numpy array for each cluster |c| = number of clusters
        c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
    k_centers : np.array
        The array of the cluster centers (prototypes) of type np.array([K, p])
    """
    len_k_list = len(k_list)
    big_ss = np.zeros([len_k_list], dtype=np.float64)
    d_eucs = np.zeros([len_k_list, len_k_list], dtype=np.float64)
    db = 0

    for k in range(len_k_list):
        big_ss[k] = big_s(k_list[k], k_centers[k])

    for k in range(len_k_list):
        for l in range(0, len_k_list):
            d_eucs[k, l] = np.linalg.norm(k_centers[k] - k_centers[l])

    for k in range(len_k_list):
        values = np.zeros([len_k_list - 1], dtype=np.float64)
        for l in range(0, k):
            values[l] = (big_ss[k] + big_ss[l]) / d_eucs[k, l]
        for l in range(k + 1, len_k_list):
            values[l - 1] = (big_ss[k] + big_ss[l]) / d_eucs[k, l]

        db += np.max(values)
    res = db / len_k_list
    return res
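A minimal usage sketch for the two entry points above; it assumes this code is appended to the same module (so dunn_fast and davisbouldin are in scope) and reuses the synthetic blobs from listing 1:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=1000, n_features=2, centers=4, random_state=28)
km = KMeans(n_clusters=4, init='random', random_state=28).fit(X)

# dunn_fast works directly on the raw points and the label vector
print('dunn :', dunn_fast(X, km.labels_))

# davisbouldin expects one array per cluster plus the matrix of centers
k_list = [X[km.labels_ == k] for k in np.sort(np.unique(km.labels_))]
print('db   :', davisbouldin(k_list, km.cluster_centers_))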