python 实现SOM: 函数更新

因作业要求，我在之前的代码（python 实现SOM：代码注释与应用示例）上做了一点修改（增加了几个简单的函数与功能），这里同时对各个函数的功能做一个列表提示。

先创建一个MiniSom的类对象：

som = MiniSom(x,y,input_len,sigma=1.0,learning_rate=0.5,neighborhood_function='gaussian')
"""
参数说明（这里只列出了几个常用的参数）：
x: 竞争层x维度，int
y: 竞争层y维度，int
input_len:输入层的向量维度，int
sigma: 获胜邻域的初始半径（参数）
neighborhood_function： 获胜邻域的计算方式，常用的包括“gaussian”，"bubble"，"mexican_hat"，'triangle'，"winner_take_all"。其中"winner_take_all"模式是我新加的，等价于"bubble"下sigma=1的情况。"""

常用的函数功能介绍如下：

初始化函数：

不使用以下初始化函数时，网络的自带初始化算法，把权重初始化为[-1,1]之间的小数，

som.random_weights_init(data):从数据集中随机采样初始化网络权重，

som.random_weights_init_random01():（新加）用（0，1）之间的小数初始化权重，

som.pca_weights_init(data):用数据集计算PCA提取主分量初始化网络权重，有时更利于收敛

网络训练函数：

som.train(data, num_iteration, random_order=False, verbose=False):用数据集训练网络。random_order=True : 打乱data中的顺序输入som
    random_order=False : 按data中的样本顺序输入som

som.train_random(data, num_iteration,verbose=False):等价于som.train(data, num_iteration, random_order=True, verbose=False)

train_batch(self, data, num_iteration, verbose=False):等价于som.train(data, num_iteration, random_order=False, verbose=False)

网络应用分析函数：

som.get_weights():返回一个张量表示所有神经元的权重，shape=[x,y,input_len],weight[i,j,:]表示网络的第（i,j）位置处的神经元的权重矢量。

som.winner(x): 返回一个坐标，样本x激活的竞争层神经元的坐标。

som.activation_response(data): 返回一个矩阵，统计每个神经元被激活的次数

som.win_map(data, return_indices=False):返回一个字典，反映每个神经元收集到的样本。字典的索引是竞争神经元的坐标，字典的内容是该神经元收集到的数据集样本，当return_indices=True时字典的内容是该神经元收集到的样本在data中的索引。

som.labels_map(data, labels):返回一个双层字典，反映每个神经元收集到的标签种类，及每个标签下的样本个数。

som.get_data_winner_map(data):（新加）返回一个矩阵，反映data中每个样本的神经元激活结果，是一个01矩阵，适用于“winner_take_all”模式。可用于CPN网络。

som.activate(x):返回一个矩阵，描述输入样本x到所有神经元的距离。

源码如下：

from math import sqrt
import numpy as np
from numpy import (array, unravel_index, nditer, linalg, random, subtract, max,power, exp, pi, zeros, ones, arange, outer, meshgrid, dot,logical_and, mean, std, cov, argsort, linspace, transpose,einsum, prod, nan, sqrt, hstack, diff, argmin, multiply)
from numpy import sum as npsum
from numpy.linalg import norm
from collections import defaultdict, Counter
from warnings import warn
from sys import stdout
from time import time
from datetime import timedelta
import pickle
import os# for unit tests
from numpy.testing import assert_almost_equal, assert_array_almost_equal
from numpy.testing import assert_array_equal
import unittest"""Minimalistic implementation of the Self Organizing Maps (SOM).
"""def _build_iteration_indexes(data_len, num_iterations,verbose=False, random_generator=None):"""Returns an iterable with the indexes of the samplesto pick at each iteration of the training.If random_generator is not None, it must be an instanceof numpy.random.RandomState and it will be usedto randomize the order of the samples."""iterations = arange(num_iterations) % data_lenif random_generator:random_generator.shuffle(iterations)if verbose:return _wrap_index__in_verbose(iterations)else:return iterationsdef _wrap_index__in_verbose(iterations):"""Yields the values in iterations printing the status on the stdout."""m = len(iterations)digits = len(str(m))progress = '\r [ {s:{d}} / {m} ] {s:3.0f}% - ? it/s'progress = progress.format(m=m, d=digits, s=0)stdout.write(progress)beginning = time()stdout.write(progress)for i, it in enumerate(iterations):yield itsec_left = ((m-i+1) * (time() - beginning)) / (i+1)time_left = str(timedelta(seconds=sec_left))[:7]progress = '\r [ {i:{d}} / {m} ]'.format(i=i+1, d=digits, m=m)progress += ' {p:3.0f}%'.format(p=100*(i+1)/m)progress += ' - {time_left} left '.format(time_left=time_left)stdout.write(progress)def fast_norm(x):"""快速计算向量的二范数，速度比linalg.norm快。Returns norm-2 of a 1-D numpy array.* faster than linalg.norm in case of 1-D arrays (numpy 1.9.2rc1)."""return sqrt(dot(x, x.T))def asymptotic_decay(learning_rate, t, max_iter):"""Decay function of the learning process.Parameters----------learning_rate : floatcurrent learning rate.t : intcurrent iteration.max_iter : intmaximum number of iterations for the training."""return learning_rate / (1+t/(max_iter/2))class MiniSom(object):def __init__(self, x, y, input_len, sigma=1.0, learning_rate=0.5,decay_function=asymptotic_decay,neighborhood_function='gaussian', topology='rectangular',activation_distance='euclidean', random_seed=None):"""Initializes a Self Organizing Maps.A rule of thumb to set the size of the grid for a dimensionalityreduction task is that it should contain 5*sqrt(N) neuronswhere N is the number of samples in the dataset to analyze.E.g. if your dataset has 150 samples, 5*sqrt(150) = 61.23hence a map 8-by-8 should perform well.Parameters----------x : intx dimension of the SOM.y : inty dimension of the SOM.input_len : intNumber of the elements of the vectors in input.sigma : float, optional (default=1.0)Spread of the neighborhood function, needs to be adequateto the dimensions of the map.(at the iteration t we have sigma(t) = sigma / (1 + t/T)where T is #num_iteration/2)learning_rate : initial learning rate(at the iteration t we havelearning_rate(t) = learning_rate / (1 + t/T)where T is #num_iteration/2)decay_function : function (default=None)Function that reduces learning_rate and sigma at each iterationthe default function is:learning_rate / (1+t/(max_iterarations/2))A custom decay function will need to to take in inputthree parameters in the following order:1. learning rate2. current iteration3. maximum number of iterations allowedNote that if a lambda function is used to define the decayMiniSom will not be pickable anymore.neighborhood_function : string, optional (default='gaussian')Function that weights the neighborhood of a position in the map.Possible values: 'gaussian', 'mexican_hat', 'bubble', 'triangle'topology : string, optional (default='rectangular')Topology of the map.Possible values: 'rectangular', 'hexagonal'activation_distance : string, optional (default='euclidean')Distance used to activate the map.Possible values: 'euclidean', 'cosine', 'manhattan', 'chebyshev'random_seed : int, optional (default=None)Random seed to use."""if sigma >= x and sigma >= y:warn('Warning: sigma is too high for the dimension of the map.')self._random_generator = random.RandomState(random_seed)self._learning_rate = learning_rateself._sigma = sigma #self._input_len = input_len# random initializationself._weights = self._random_generator.rand(x, y, input_len)*2-1 # 网络权重定义与随机初始化self._weights /= linalg.norm(self._weights, axis=-1, keepdims=True)self._activation_map = zeros((x, y))self._neigx = arange(x)  # 近邻x,是一个列表self._neigy = arange(y)  # used to evaluate the neighborhood functionif topology not in ['hexagonal', 'rectangular']:msg = '%s not supported only hexagonal and rectangular available'raise ValueError(msg % topology)self.topology = topologyself._xx, self._yy = meshgrid(self._neigx, self._neigy)self._xx = self._xx.astype(float)self._yy = self._yy.astype(float)if topology == 'hexagonal':self._xx[::-2] -= 0.5if neighborhood_function in ['triangle']:warn('triangle neighborhood function does not ' +'take in account hexagonal topology')self._decay_function = decay_functionneig_functions = {'gaussian': self._gaussian,'mexican_hat': self._mexican_hat,'bubble': self._bubble,'triangle': self._triangle,'winner_take_all': self._winner_take_all}if neighborhood_function not in neig_functions:msg = '%s not supported. Functions available: %s'raise ValueError(msg % (neighborhood_function,', '.join(neig_functions.keys())))if neighborhood_function in ['triangle','bubble'] and (divmod(sigma, 1)[1] != 0or sigma < 1):warn('sigma should be an integer >=1 when triangle or bubble' +'are used as neighborhood function')self.neighborhood = neig_functions[neighborhood_function]distance_functions = {'euclidean': self._euclidean_distance,'cosine': self._cosine_distance,'manhattan': self._manhattan_distance,'chebyshev': self._chebyshev_distance}if activation_distance not in distance_functions:msg = '%s not supported. Distances available: %s'raise ValueError(msg % (activation_distance,', '.join(distance_functions.keys())))self._activation_distance = distance_functions[activation_distance]# 选择不同的距离计算方式，计算x到w的距离def get_weights(self):"""获取神经元权重Returns the weights of the neural network."""return self._weightsdef get_euclidean_coordinates(self):"""Returns the position of the neurons on an euclideanplane that reflects the chosen topology in two meshgrids xx and yy.Neuron with map coordinates (1, 4) has coordinate (xx[1, 4], yy[1, 4])in the euclidean plane.Only useful if the topology chosen is not rectangular.获取欧几里得平面上，获取神经元的坐标，当拓扑结构为非矩形时才有用"""return self._xx.T, self._yy.Tdef convert_map_to_euclidean(self, xy):"""Converts map coordinates into euclidean coordinatesthat reflects the chosen topology.Only useful if the topology chosen is not rectangular."""return self._xx.T[xy], self._yy.T[xy]def _activate(self, x):"""功能：计算输入样本到所有竞争层神经元的距离，返回一个矩阵。是一个距离矩阵。并把距离矩阵存储到self._activation_map中Updates matrix activation_map, in this matrixthe element i,j is the response of the neuron i,j to x."""self._activation_map = self._activation_distance(x, self._weights)def activate(self, x):"""返回x到所有神经元的距离矩阵Returns the activation map to x."""self._activate(x)return self._activation_mapdef _gaussian(self, c, sigma):"""Returns a Gaussian centered in c.定义领域"""d = 2*pi*sigma*sigma # 高斯函数的方差，当方差为0时ax = exp(-power(self._xx-self._xx.T[c], 2)/d)ay = exp(-power(self._yy-self._yy.T[c], 2)/d)spread_map = (ax * ay).T  # the external product gives a matrix 返回一个外积矩阵return spread_mapdef _mexican_hat(self, c, sigma):"""Mexican hat centered in c."""p = power(self._xx-self._xx.T[c], 2) + power(self._yy-self._yy.T[c], 2)d = 2*pi*sigma*sigmaspread_map = (exp(-p/d)*(1-2/d*p)).Treturn spread_mapdef _bubble(self, c, sigma):"""Constant function centered in c with spread sigma.sigma should be an odd value."""ax = logical_and(self._neigx > c[0]-sigma,self._neigx < c[0]+sigma)ay = logical_and(self._neigy > c[1]-sigma,self._neigy < c[1]+sigma)spread_map = outer(ax, ay)*1.return spread_mapdef _bubble_1D(self, c, sigma):"""Constant function centered in c with spread sigma.sigma should be an odd value."""ax = logical_and(self._neigx > c[0]-sigma,self._neigx < c[0]+sigma)ay = (self._neigy == c[1])spread_map = outer(ax, ay)*1.return spread_mapdef _winner_take_all(self, c, sigma):"""Constant function centered in c with spread sigma.sigma should be an odd value."""ax = (self._neigx == c[0])ay = (self._neigy == c[1])spread_map = outer(ax, ay)*1.return spread_mapdef _triangle(self, c, sigma):"""Triangular function centered in c with spread sigma."""triangle_x = (-abs(c[0] - self._neigx)) + sigmatriangle_y = (-abs(c[1] - self._neigy)) + sigmatriangle_x[triangle_x < 0] = 0.triangle_y[triangle_y < 0] = 0.spread_map = outer(triangle_x, triangle_y)return spread_mapdef _cosine_distance(self, x, w):num = (w * x).sum(axis=2)denum = multiply(linalg.norm(w, axis=2), linalg.norm(x))return 1 - num / (denum+1e-8)def _euclidean_distance(self, x, w):"""计算一个样本到一个神经元的距离，返回的是一个标量，欧式距离:param x::param w::return:"""return linalg.norm(subtract(x, w), axis=-1)# subtract是减法，x-w。np.linalg.norm(求范数)axis=1表示按行向量处理，求多个行向量的范数，axis=0表示按列向量处理，求多个列向量的范数def _manhattan_distance(self, x, w):return linalg.norm(subtract(x, w), ord=1, axis=-1)def _chebyshev_distance(self, x, w):return max(subtract(x, w), axis=-1)def _check_iteration_number(self, num_iteration):if num_iteration < 1:raise ValueError('num_iteration must be > 1')def _check_input_len(self, data):"""Checks that the data in input is of the correct shape."""data_len = len(data[0])if self._input_len != data_len:msg = 'Received %d features, expected %d.' % (data_len,self._input_len)raise ValueError(msg)def winner(self, x):"""获胜者：计算输入样本x的获胜神经元的坐标Computes the coordinates of the winning neuron for the sample x."""self._activate(x)return unravel_index(self._activation_map.argmin(), # 距离近的获胜self._activation_map.shape)def get_data_winner_map(self,data):"""返回一个矩阵：记录数据集中每一个样本的获胜者，获胜神经元记为1，其余记为0:param data::return:"""num = data.shape[0]data_winner_map = np.zeros((num, 20))for i in range(num):winner = self.winner(data[i])data_winner_map[i, winner[1]] = 1.return data_winner_mapdef update(self, x, win, t, max_iteration):"""神经元的权重更新：第一步：按照当前的迭代次数，计算当前的学习率和扩展范围第二步：计算每个激活神经元的学习率第三步：按照更新规则更新权重Updates the weights of the neurons.更新神经元的权重Parameters----------x : np.array 当前输入向量Current pattern to learn.win : tuple 获胜的神经元的列表Position of the winning neuron for x (array or tuple).t : int  当前的迭代次数，用于计算学习率，激活区域大小Iteration indexmax_iteration : int 最大的迭代次数，用于计算学习率，激活区域大小Maximum number of training itarations."""eta = self._decay_function(self._learning_rate, t, max_iteration) # 更新学习率# sigma and learning rate decrease with the same rulesig = self._decay_function(self._sigma, t, max_iteration) # 更新激活范围# improves the performancesg = self.neighborhood(win, sig)*eta # 根据获胜神经元，获取其邻域，计算邻域的学习率# w_new = eta * neighborhood_function * (x-w)self._weights += einsum('ij, ijk->ijk', g, x-self._weights)def quantization(self, data):"""量化：返回距离数据集最近的神经元的权重，并把这个权重重复，分配给每个样本。Assigns a code book (weights vector of the winning neuron)to each sample in data."""self._check_input_len(data)# 所有样本与所有神经元之间的所有距离的最小值winners_coords = argmin(self._distance_from_weights(data), axis=1) # 获取获胜神经元的坐标print(winners_coords)return self._weights[unravel_index(winners_coords,self._weights.shape[:2])]def random_weights_init(self, data):"""随机权重初始化：利用数据集中的样本随机初始化权重Initializes the weights of the SOMpicking random samples from data."""self._check_input_len(data)it = nditer(self._activation_map, flags=['multi_index'])while not it.finished:rand_i = self._random_generator.randint(len(data))self._weights[it.multi_index] = data[rand_i]it.iternext()def random_weights_init_random01(self):"""随机权重初始化：利用[0,1]之间的小数初始化权重并归一化"""self._weights = np.random.random(self._weights.shape)self._weights /= np.linalg.norm(self._weights, axis=-1, keepdims=True)def pca_weights_init(self, data):"""PCA方法初始化权重：先计算输入样本的两个PCA主分量，用两个主分量的加权和初始化网络权重。该初始化方法使训练过程收敛更快强烈推荐在初始化权重前将样本归一化，并且在训练过程中使用相同的归一化。Initializes the weights to span the first two principal components.This initialization doesn't depend on random processes andmakes the training process converge faster.It is strongly reccomended to normalize the data before initializingthe weights and use the same normalization for the training data."""if self._input_len == 1:msg = 'The data needs at least 2 features for pca initialization'raise ValueError(msg)self._check_input_len(data)if len(self._neigx) == 1 or len(self._neigy) == 1:msg = 'PCA initialization inappropriate:' + \'One of the dimensions of the map is 1.'warn(msg)pc_length, pc = linalg.eig(cov(transpose(data))) # np.cov计算协方差，np.linalg.eig计算矩阵的特征向量pc_order = argsort(-pc_length)# 特征值排序，argsort函数返回的是数组值从小到大的索引值for i, c1 in enumerate(linspace(-1, 1, len(self._neigx))):for j, c2 in enumerate(linspace(-1, 1, len(self._neigy))):self._weights[i, j] = c1*pc[pc_order[0]] + c2*pc[pc_order[1]]# 上面：pc[pc_order[0]] 表示最大特征值对应的特征向量#      pc[pc_order[1]] 表示第二大特征值对应的特征向量#      c1*pc[pc_order[0]] + c2*pc[pc_order[1]] 表示两个特征向量的加权和def train(self, data, num_iteration, random_order=False, verbose=False):"""Trains the SOM.Parameters----------data : np.array or listData matrix.num_iteration : intMaximum number of iterations (one iteration per sample).random_order : bool (default=False)用以决定是否从数据集中随机采集样本训练If True, samples are picked in random order.Otherwise the samples are picked sequentially.verbose : bool (default=False) 用以决定是否打印迭代信息If True the status of the trainingwill be printed at each iteration."""self._check_iteration_number(num_iteration)self._check_input_len(data)random_generator = Noneif random_order:random_generator = self._random_generatoriterations = _build_iteration_indexes(len(data), num_iteration,verbose, random_generator)for t, iteration in enumerate(iterations):self.update(data[iteration], self.winner(data[iteration]),t, num_iteration)if verbose:print('\n quantization error:', self.quantization_error(data))def train_random(self, data, num_iteration, verbose=False):"""Trains the SOM picking samples at random from data.Parameters----------data : np.array or listData matrix.num_iteration : intMaximum number of iterations (one iteration per sample).verbose : bool (default=False)If True the status of the trainingwill be printed at each iteration."""self.train(data, num_iteration, random_order=True, verbose=verbose)def train_batch(self, data, num_iteration, verbose=False):"""按数据集的样本顺序训练SOMTrains the SOM using all the vectors in data sequentially.Parameters----------data : np.array or listData matrix.num_iteration : intMaximum number of iterations (one iteration per sample).verbose : bool (default=False)If True the status of the trainingwill be printed at each iteration."""self.train(data, num_iteration, random_order=False, verbose=verbose)def distance_map(self):"""权重的距离地图每个元素是一个神经元与它近邻之间的距离之和。Returns the distance map of the weights.Each cell is the normalised sum of the distances betweena neuron and its neighbours. Note that this method usesthe euclidean distance."""um = zeros((self._weights.shape[0],self._weights.shape[1],8))  # 2 spots more for hexagonal topology 对于六边形来说，有两个是多余的# 定义了一个矩形的八个邻域 以当前区域坐标为[0,0]ii = [[0, -1, -1, -1, 0, 1, 1, 1]]*2jj = [[-1, -1, 0, 1, 1, 1, 0, -1]]*2if self.topology == 'hexagonal':# 六边形拓扑结构ii = [[1, 1, 1, 0, -1, 0], [0, 1, 0, -1, -1, -1]]jj = [[1, 0, -1, -1, 0, 1], [1, 0, -1, -1, 0, 1]]# 遍历每个神经元for x in range(self._weights.shape[0]):for y in range(self._weights.shape[1]):w_2 = self._weights[x, y] # 提取神经元的权重e = y % 2 == 0   # only used on hexagonal topology # 仅仅在六边形中应用# 对于该神经元的第k个邻域，坐标为[i,j]for k, (i, j) in enumerate(zip(ii[e], jj[e])):# 如果这个邻域存在/没有超出som竞争层的边界if (x+i >= 0 and x+i < self._weights.shape[0] andy+j >= 0 and y+j < self._weights.shape[1]):w_1 = self._weights[x+i, y+j] # 获取这个邻域的权重um[x, y, k] = fast_norm(w_2-w_1) # 并计算这两个神经元之间的欧式距离，um = um.sum(axis=2) # 将每个神经元与其相邻神经元之间的距离求累加和return um/um.max() # 利用最大距离归一化def activation_response(self, data):"""记录神经元被激活的次数激活响应：返回一个矩阵，矩阵的第[i,j]个元素，反应第[i,j]个神经元在数据集data中被激活的次数Returns a matrix where the element i,j is the number of timesthat the neuron i,j have been winner."""self._check_input_len(data)a = zeros((self._weights.shape[0], self._weights.shape[1]))for x in data:a[self.winner(x)] += 1return adef _distance_from_weights(self, data):"""返回一个矩阵：反应第i个样本与第j个神经元之间的距离（欧式距离）Returns a matrix d where d[i,j] is the euclidean distance betweendata[i] and the j-th weight."""input_data = array(data)weights_flat = self._weights.reshape(-1, self._weights.shape[2])# 把神经元的拓扑结构拉直input_data_sq = power(input_data, 2).sum(axis=1, keepdims=True)weights_flat_sq = power(weights_flat, 2).sum(axis=1, keepdims=True)cross_term = dot(input_data, weights_flat.T)return sqrt(-2 * cross_term + input_data_sq + weights_flat_sq.T)def quantization_error(self, data):"""量化误差：计算数据集中的样本到最佳匹配神经元的距离的均值。最佳匹配神经元：到数据集最近的神经元Returns the quantization error computed as the averagedistance between each input sample and its best matching unit."""self._check_input_len(data)return norm(data-self.quantization(data), axis=1).mean()def topographic_error(self, data):"""拓扑结构误差：对每个输入样本，找到最佳匹配神经元和第二最佳匹配神经元，评估他们的位置。当二者的位置不相邻时，把该样本统计为一个误差。拓扑结构误差 = 误差样本总数/所有样本总数如果拓扑结构误差=1，所有样本的拓扑结构都没有被保存。Returns the topographic error computed by findingthe best-matching and second-best-matching neuron in the mapfor each input and then evaluating the positions.A sample for which these two nodes are not adjacent counts asan error. The topographic error is given by thethe total number of errors divided by the total of samples.If the topographic error is 0, no error occurred.If 1, the topology was not preserved for any of the samples."""self._check_input_len(data)if self.topology == 'hexagonal':msg = 'Topographic error not implemented for hexagonal topology.'raise NotImplementedError(msg)total_neurons = prod(self._activation_map.shape) # 神经元的总个数if total_neurons == 1:warn('The topographic error is not defined for a 1-by-1 map.')return nant = 1.42# b2mu: best 2 matching unitsb2mu_inds = argsort(self._distance_from_weights(data), axis=1)[:, :2]# 最大的两个神经元的索引，一维坐标b2my_xy = unravel_index(b2mu_inds, self._weights.shape[:2])# 最大的两个神经元的索引，二维坐标b2mu_x, b2mu_y = b2my_xy[0], b2my_xy[1] # b2mu_x=（x1,x2）,b2mu_y=（y1,y2）dxdy = hstack([diff(b2mu_x), diff(b2mu_y)]) # dxdy = [dx,dy]distance = norm(dxdy, axis=1) # 计算两个神经元的欧式距离return (distance > t).mean() #统计距离超过1.42的样本的个数，并除以样本总个数def win_map(self, data, return_indices=False):"""赢者图：统计每个神经元收集到的样本。如果 return_indices=True, 那么统计每个神经元收集到的样本在数据集中的索引。返回一个字典：字典的索引是竞争神经元的坐标字典的内容是该神经元收集到的数据集样本。Returns a dictionary wm where wm[(i,j)] is a list with:- all the patterns that have been mapped to the position (i,j),if return_indices=False (default)- all indices of the elements that have been mapped to theposition (i,j) if return_indices=True"""self._check_input_len(data)winmap = defaultdict(list)for i, x in enumerate(data):winmap[self.winner(x)].append(i if return_indices else x)return winmapdef labels_map(self, data, labels):"""标签图：统计每个神经元收集到的样本的标签种类，及每个标签的出现频次返回一个双层字典wm. wm是一个字典，wm的元素wm[(i,j)]也是一个字典wm的索引是神经元的坐标wm[(i,j)]的索引是样本的标签，内容是该标签出现的频次/该标签被映射到这个神经元的次数Returns a dictionary wm where wm[(i,j)] is a dictionarythat contains the number of samples from a given labelthat have been mapped in position i,j.----------data : np.array or listData matrix.label : np.array or listLabels for each sample in data."""self._check_input_len(data)if not len(data) == len(labels):raise ValueError('data and labels must have the same length.')winmap = defaultdict(list) # 是一个字典for x, l in zip(data, labels):winmap[self.winner(x)].append(l) # 每一个神经元是一个字典条目，name=神经元的索引，内容是这个神经元下的不同标签，是一个列表：归类到这个神经元下的样本的标签for position in winmap: # 遍历每一个神经元，统计分配到这个神经元下样本的个数winmap[position] = Counter(winmap[position]) # counter 输入一个列表，返回一个字典，字典的内容是：在列表中每个元素出现的次数return winmap # 是一个双层字典，存储每个神经元下收集到的不同样本的标签，并统计每个标签出现的频次class TestMinisom(unittest.TestCase):"""测试类，测试MiniSom的每个函数"""def setUp(self):self.som = MiniSom(5, 5, 1)for i in range(5):for j in range(5):# checking weights normalizationassert_almost_equal(1.0, linalg.norm(self.som._weights[i, j]))self.som._weights = zeros((5, 5, 1))  # fake weightsself.som._weights[2, 3] = 5.0self.som._weights[1, 1] = 2.0def test_decay_function(self):assert self.som._decay_function(1., 2., 3.) == 1./(1.+2./(3./2))def test_fast_norm(self):assert fast_norm(array([1, 3])) == sqrt(1+9)def test_euclidean_distance(self):x = zeros((1, 2))w = ones((2, 2, 2))d = self.som._euclidean_distance(x, w)assert_array_almost_equal(d, [[1.41421356, 1.41421356],[1.41421356, 1.41421356]])def test_cosine_distance(self):x = zeros((1, 2))w = ones((2, 2, 2))d = self.som._cosine_distance(x, w)assert_array_almost_equal(d, [[1., 1.],[1., 1.]])def test_manhattan_distance(self):x = zeros((1, 2))w = ones((2, 2, 2))d = self.som._manhattan_distance(x, w)assert_array_almost_equal(d, [[2., 2.],[2., 2.]])def test_chebyshev_distance(self):x = array([1, 3])w = ones((2, 2, 2))d = self.som._chebyshev_distance(x, w)assert_array_almost_equal(d, [[2., 2.],[2., 2.]])def test_check_input_len(self):with self.assertRaises(ValueError):self.som.train_batch([[1, 2]], 1)with self.assertRaises(ValueError):self.som.random_weights_init(array([[1, 2]]))with self.assertRaises(ValueError):self.som._check_input_len(array([[1, 2]]))self.som._check_input_len(array([[1]]))self.som._check_input_len([[1]])def test_unavailable_neigh_function(self):with self.assertRaises(ValueError):MiniSom(5, 5, 1, neighborhood_function='boooom')def test_unavailable_distance_function(self):with self.assertRaises(ValueError):MiniSom(5, 5, 1, activation_distance='ridethewave')def test_gaussian(self):bell = self.som._gaussian((2, 2), 1)assert bell.max() == 1.0assert bell.argmax() == 12  # unravel(12) = (2,2)def test_mexican_hat(self):bell = self.som._mexican_hat((2, 2), 1)assert bell.max() == 1.0assert bell.argmax() == 12  # unravel(12) = (2,2)def test_bubble(self):bubble = self.som._bubble((2, 2), 1)assert bubble[2, 2] == 1assert sum(sum(bubble)) == 1def test_triangle(self):bubble = self.som._triangle((2, 2), 1)assert bubble[2, 2] == 1assert sum(sum(bubble)) == 1def test_win_map(self):winners = self.som.win_map([[5.0], [2.0]])assert winners[(2, 3)][0] == [5.0]assert winners[(1, 1)][0] == [2.0]def test_win_map_indices(self):winners = self.som.win_map([[5.0], [2.0]], return_indices=True)assert winners[(2, 3)] == [0]assert winners[(1, 1)] == [1]def test_labels_map(self):labels_map = self.som.labels_map([[5.0], [2.0]], ['a', 'b'])assert labels_map[(2, 3)]['a'] == 1assert labels_map[(1, 1)]['b'] == 1with self.assertRaises(ValueError):self.som.labels_map([[5.0]], ['a', 'b'])def test_activation_reponse(self):response = self.som.activation_response([[5.0], [2.0]])assert response[2, 3] == 1assert response[1, 1] == 1def test_activate(self):assert self.som.activate(5.0).argmin() == 13.0  # unravel(13) = (2,3)def test_distance_from_weights(self):data = arange(-5, 5).reshape(-1, 1)weights = self.som._weights.reshape(-1, self.som._weights.shape[2])distances = self.som._distance_from_weights(data)for i in range(len(data)):for j in range(len(weights)):assert(distances[i][j] == norm(data[i] - weights[j]))def test_quantization_error(self):assert self.som.quantization_error([[5], [2]]) == 0.0assert self.som.quantization_error([[4], [1]]) == 1.0def test_topographic_error(self):# 5 will have bmu_1 in (2,3) and bmu_2 in (2, 4)# which are in the same neighborhoodself.som._weights[2, 4] = 6.0# 15 will have bmu_1 in (4, 4) and bmu_2 in (0, 0)# which are not in the same neighborhoodself.som._weights[4, 4] = 15.0self.som._weights[0, 0] = 14.assert self.som.topographic_error([[5]]) == 0.0assert self.som.topographic_error([[15]]) == 1.0self.som.topology = 'hexagonal'with self.assertRaises(NotImplementedError):assert self.som.topographic_error([[5]]) == 0.0self.som.topology = 'rectangular'def test_quantization(self):q = self.som.quantization(array([[4], [2]]))assert q[0] == 5.0assert q[1] == 2.0def test_random_seed(self):som1 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)som2 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)# same initializationassert_array_almost_equal(som1._weights, som2._weights)data = random.rand(100, 2)som1 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)som1.train_random(data, 10)som2 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)som2.train_random(data, 10)# same state after trainingassert_array_almost_equal(som1._weights, som2._weights)def test_train_batch(self):som = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)data = array([[4, 2], [3, 1]])q1 = som.quantization_error(data)som.train(data, 10)assert q1 > som.quantization_error(data)data = array([[1, 5], [6, 7]])q1 = som.quantization_error(data)som.train_batch(data, 10, verbose=True)assert q1 > som.quantization_error(data)def test_train_random(self):som = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)data = array([[4, 2], [3, 1]])q1 = som.quantization_error(data)som.train(data, 10, random_order=True)assert q1 > som.quantization_error(data)data = array([[1, 5], [6, 7]])q1 = som.quantization_error(data)som.train_random(data, 10, verbose=True)assert q1 > som.quantization_error(data)def test_random_weights_init(self):som = MiniSom(2, 2, 2, random_seed=1)som.random_weights_init(array([[1.0, .0]]))for w in som._weights:assert_array_equal(w[0], array([1.0, .0]))def test_pca_weights_init(self):som = MiniSom(2, 2, 2)som.pca_weights_init(array([[1.,  0.], [0., 1.], [1., 0.], [0., 1.]]))expected = array([[[0., -1.41421356], [-1.41421356, 0.]],[[1.41421356, 0.], [0., 1.41421356]]])assert_array_almost_equal(som._weights, expected)def test_distance_map(self):som = MiniSom(2, 2, 2, random_seed=1)som._weights = array([[[1.,  0.], [0., 1.]], [[1., 0.], [0., 1.]]])assert_array_equal(som.distance_map(), array([[1., 1.], [1., 1.]]))som = MiniSom(2, 2, 2, topology='hexagonal', random_seed=1)som._weights = array([[[1.,  0.], [0., 1.]], [[1., 0.], [0., 1.]]])assert_array_equal(som.distance_map(), array([[.5, 1.], [1., .5]]))def test_pickling(self):with open('som.p', 'wb') as outfile:pickle.dump(self.som, outfile)with open('som.p', 'rb') as infile:pickle.load(infile)os.remove('som.p')