- # import
- # ------------------------------------------------------------------------------------------
- import os
- import matplotlib.pyplot as plt
- import numpy as np
- import copy
- # multiprocessing and functools
- # import multiprocessing as mp
- from pathos.multiprocessing import ProcessingPool as Pool
- from functools import partial
- # miam import
- import miam.utils
- import miam.image.Image as MIMG
- import miam.processing.ColorSpaceTransform as MCST
- import miam.histogram.Histogram as MHIST
- import miam.math as MMATH
- # ------------------------------------------------------------------------------------------
- # MIAM project 2020
- # ------------------------------------------------------------------------------------------
- # author: remi.cozot@univ-littoral.fr
- # ------------------------------------------------------------------------------------------
class kmeans(object):
    """K-means clustering driven by pluggable distance and normalization objects.

    distance:  object exposing eval(sample, centroid) -> float.
    normalize: object exposing eval(v) (single centroid / histogram case) and
               evals(v) (batch of centroids / palette case) — inferred from
               usage below, TODO confirm against miam.math implementations.
    """

    # Fixed seed at class-definition time so clustering runs are reproducible.
    np.random.seed(1968)

    def __init__(self, distance, normalize):
        # distance used for computation between samples and centroids
        self.distance = distance
        # normalization applied to centroids after init/update
        self.normalize = normalize

    def _randomCentroids(self, count, dimCentroid, initRange):
        """Draw `count` random centroids of shape `dimCentroid`, optionally
        rescaled into `initRange`, then normalized.

        count:       number of centroids to draw.
        dimCentroid: tuple (palette/tensor case) or int (histogram case).
        initRange:   None (components stay in 0..1) or a list of
                     (min, max) tuples, one per component of the last axis.
        Returns the normalized centroid array.
        """
        if isinstance(dimCentroid, tuple):
            u = np.random.rand(count, *dimCentroid)
        else:  # integer dimension (histogram case)
            u = np.random.rand(count, dimCentroid)
        if initRange:
            minRange = [r[0] for r in initRange]
            maxRange = [r[1] for r in initRange]
            # linear blend between the per-component bounds
            u = (1 - u) * np.asarray(minRange) + u * np.asarray(maxRange)
        if isinstance(dimCentroid, tuple):
            return self.normalize.evals(u)  # palette
        return self.normalize.eval(u)       # histogram

    def MPassignSamplesToCentroids(self, centroids, samples, previousAssigmentIdx):
        """Parallel version of assignSamplesToCentroids (same return contract).

        Returns (assigments, assigmentsIdx, conv) where conv is
        (numberOfChange, numberOfRemain, mean distance to assigned centroid).
        """
        def assignSampleToCentroid(previousAssigmentIdx, centroids, distance, isamp):
            """Return (jdist, samp, i, dist, change, remain) for one sample."""
            i, samp = isamp
            dist = 0.0   # distance to the closest centroid so far
            jdist = 0    # index of the closest centroid so far
            for j, cent in enumerate(centroids):
                d = distance.eval(samp, cent)
                if j == 0 or d < dist:
                    dist = d
                    jdist = j
            # did the sample change of centroid since the previous iteration?
            if not i in previousAssigmentIdx[jdist]:
                change, remain = 1, 0
            else:
                change, remain = 0, 1
            return (jdist, samp, i, dist, change, remain)

        # partial to reduce the parallel function to a single argument
        pAss = partial(assignSampleToCentroid, previousAssigmentIdx, centroids, self.distance)
        isamps = list(enumerate(samples))
        # NOTE(review): pathos caches its pool internally; we deliberately do
        # not close it here so repeated calls reuse the same workers.
        _pool = Pool()
        results = list(_pool.map(pAss, isamps))  # launch on all samples
        # aggregate the per-sample results
        numberSamples = len(samples)
        numberOfChange, numberOfRemain = 0, 0  # samples that change of/remain in centroid
        sumDistance = 0                        # sum of distances to assigned centroids
        assigments = [[] for _ in centroids]
        assigmentsIdx = [[] for _ in centroids]
        for jdist, samp, i, dist, change, remain in results:
            assigments[jdist].append(samp)
            assigmentsIdx[jdist].append(i)
            sumDistance += dist
            numberOfChange += change
            numberOfRemain += remain
        # convergence data; guard against an empty sample set
        meanDistance = sumDistance / numberSamples if numberSamples else 0.0
        conv = (numberOfChange, numberOfRemain, meanDistance)
        return (assigments, assigmentsIdx, conv)

    def assignSamplesToCentroids(self, centroids, samples, previousAssigmentIdx):
        """Assign every sample to its closest centroid (serial version).

        previousAssigmentIdx: per-centroid lists of sample indices from the
        previous iteration, used to count changes/remains for convergence.
        Returns (assigments, assigmentsIdx, conv) with
        conv = (numberOfChange, numberOfRemain, mean distance).
        """
        numberOfChange, numberOfRemain = 0, 0  # convergence counters
        sumDistance = 0                        # sum of distances to assigned centroids
        assigments = [[] for _ in centroids]
        assigmentsIdx = [[] for _ in centroids]
        numberSamples = len(samples)
        # DEBUG
        print("")
        # END DEBUG
        for i, samp in enumerate(samples):
            # DEBUG: progress line, overwritten in place via '\r'
            print("\r kmeans.assignSamplesToCentroids: ",i,"/",numberSamples,"[remains:",numberOfRemain,"][changes:",numberOfChange,"][mean distance:",sumDistance/(i+1),"] ", end = '\r')
            # END DEBUG
            dist = 0.0   # distance to the closest centroid so far
            jdist = 0    # index of the closest centroid so far
            for j, cent in enumerate(centroids):
                d = self.distance.eval(samp, cent)
                if j == 0 or d < dist:
                    dist = d
                    jdist = j
            assigments[jdist].append(samp)
            assigmentsIdx[jdist].append(i)
            sumDistance += dist
            if not i in previousAssigmentIdx[jdist]:
                numberOfChange += 1
            else:
                numberOfRemain += 1
        # convergence data; guard against an empty sample set
        meanDistance = sumDistance / numberSamples if numberSamples else 0.0
        conv = (numberOfChange, numberOfRemain, meanDistance)
        return (assigments, assigmentsIdx, conv)

    def averageAssigments(self, assignements):
        """Return the per-cluster mean of the assigned samples, normalized.

        Clusters with no assigned sample keep an empty list [] as placeholder
        (the caller treats non-ndarray entries as empty clusters).
        """
        assigmentAverage = [[] for _ in assignements]
        for i, assigment_i in enumerate(assignements):
            cluster = np.asarray(assigment_i)
            if cluster.size > 0:
                assigmentAverage[i] = self.normalize.eval(np.mean(cluster, axis=0))
        return assigmentAverage

    def kmeans(self, samples, nbClusters, nbIter, display = None, initRange=None, multiPro=False):
        """Run k-means on `samples` and return (centroids, assigments, assigmentsIdx).

        attribute(s):
            samples: samples to cluster (np.ndarray);
                samples.shape[0] is the number of samples,
                samples.shape[1:] the shape of one sample (vector, matrix or
                tensor — e.g. (5, 3) means each sample is a 5x3 matrix).
            nbClusters: number of clusters to compute (int).
            nbIter: maximum number of k-means iterations (int).
            display: object with a plot() method called at each iteration,
                or None. !!! requires refactoring !!!
            initRange: None (random init components in 0..1) or a list of
                (min, max) tuples, one per component of the sample's last
                axis, used to rescale the random initial centroids.
            multiPro: use the multiprocessing assignment when True.
        """
        dimSamp = samples.shape
        dimCentroid = dimSamp[1:]  # shape of one centroid (palette/histogram)
        # init centroids at random (dimSamp is an ndarray shape, i.e. a tuple)
        centroids = self._randomCentroids(nbClusters, dimCentroid, initRange)
        # per-cluster assignment bookkeeping
        previousAssigmentsIdx = [[] for _ in range(nbClusters)]
        assigments = [[] for _ in range(nbClusters)]
        assigmentsIdx = [[] for _ in range(nbClusters)]
        # convergence traces
        changes, remains, meanDistances = [], [], []
        # MAIN LOOP
        # -----------------------------------------------------------------------------------------
        for iteration in range(nbIter):
            print("\r","kmeans(iteration): ",iteration,"/",nbIter,":",iteration*100//nbIter," % ",end = '\r')
            # assign samples to centroids
            if multiPro:
                (assigments, assigmentsIdx, conv) = self.MPassignSamplesToCentroids(centroids, samples, previousAssigmentsIdx)
            else:
                (assigments, assigmentsIdx, conv) = self.assignSamplesToCentroids(centroids, samples, previousAssigmentsIdx)
            # recover convergence data
            change, remain, meanDist = conv
            changes.append(change)
            remains.append(remain)
            meanDistances.append(meanDist)
            # per-cluster means become the new centroids
            assigmentsAverage = self.averageAssigments(assigments)
            # update centroids; an empty cluster forbids early stopping
            canBreak = True
            for i, ass_av in enumerate(assigmentsAverage):
                if isinstance(ass_av, np.ndarray) and ass_av.size != 0:
                    centroids[i] = ass_av
                else:
                    # empty cluster: draw a single fresh centroid
                    # (BUGFIX: the original histogram branch regenerated
                    # nbClusters centroids here instead of one)
                    canBreak = False
                    centroids[i] = self._randomCentroids(1, dimCentroid, initRange)
                    print("")
                    print("WARNING[miam.classification.kmeans(): (iteration:",iteration,"/centroid:",i,"): no assigment! >> compute new centroid]")
            # display
            if display: display.plot(centroids, assigmentsIdx, iteration, (changes, remains, meanDistances), len(samples))
            # memory of the current assignment for the next iteration
            previousAssigmentsIdx = copy.deepcopy(assigmentsIdx)
            # stop as soon as no sample changed cluster and no cluster is empty
            if (change == 0) and canBreak: break
        # -----------------------------------------------------------------------------------------
        print(" ")
        return (centroids, assigments, assigmentsIdx)
|