From 7cec18aaf237d1d7ec9011b405befb95fa6ceca6 Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 11:58:07 +0200 Subject: [PATCH 01/16] Update README.rst description of new functionalities for high dimensionality problems and improved performance --- README.rst | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/README.rst b/README.rst index f2e7c93..18e30e1 100644 --- a/README.rst +++ b/README.rst @@ -53,3 +53,52 @@ The parameter passed to getclusters is the count of clusters generated. .. image:: https://readthedocs.org/projects/python-cluster/badge/?version=latest :target: http://python-cluster.readthedocs.org :alt: Documentation Status + + + +2015/07/20 NEW FUNCTIONALITIES FOR HIGH AND LOW DIMENSIONALITY PROBLEMS +======================================================================= +Authors of new added functionalities: + Garcia Aranda, Jose Javier jose_javier.garcia_aranda@alcatel-lucent.com + Ramos Diaz, Juan juanrd0088@gmail.com + +Acknoledgements: + Authors want to thank the Spanish Economy & competitiveness Ministry which funds this research + through "INNPACTO" innovation program IPT-2012-0839-430000. + + +High dimensionality (HD) problems are those which have items with high number of dimensions +There are two types of HD problems: + a) set of items with large number of dimensions + b) set of items with a limited number of dimensions from a large available number of dimensions + For example considering dimensions X, Y, Z, K, L, M and the items: + item1=(X=2, Z=5, L=7) + item2=(X=6, Y=5, M=7) + +The HD problems involves a high cost computation because distance functions in this case takes more +operations than Low dimensionality problems. + +For case "b" (valid also for "a"), a new distance for HD problems is available: HDdistItems() ,HDequals() +This distance function compares dimensions between 2 items. +Each dimension of item1 is searched in item2, and if it is found, then the distance takes into account the difference (mahatan style) +if the dimension does not exist in item2, a maximum value is added to the total distance between item1 and item2 + +there is no difference with current usage: + + >>>cl = KMeansClustering(users,HDdistItems,HDequals); + + +Additionally, now the number of iterations can be limited in order to save time +Experimentally, we have concluded that 10 iterations is enough accurate for most cases. +The new HDgetClusters() function is linear. Avoid the recalculation of centroids +whereas original function getClusters() is N*N complex, because recalculate the +centroid when move an item from one cluster to another. +This new function can be used for low and high dimensionality problems, increasing +performance in both cases + + >>>solution = cl.HDgetclusters(numclusters,max_iterations); + +Other new available optimization inside HDcentroid() function in is the use of mean instead median at centroid calculation. +median is more accurate but involves more computations when N is huge. 
+The function HDcentroid() is invoked internally by HDgetclusters() + From 2252eaddb395fb32db03659d818e2ffea5ffe701 Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 11:59:20 +0200 Subject: [PATCH 02/16] Update README.rst --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 18e30e1..510c607 100644 --- a/README.rst +++ b/README.rst @@ -69,8 +69,8 @@ Acknoledgements: High dimensionality (HD) problems are those which have items with high number of dimensions There are two types of HD problems: - a) set of items with large number of dimensions - b) set of items with a limited number of dimensions from a large available number of dimensions + a)set of items with large number of dimensions + b)set of items with a limited number of dimensions from a large available number of dimensions For example considering dimensions X, Y, Z, K, L, M and the items: item1=(X=2, Z=5, L=7) item2=(X=6, Y=5, M=7) From f1aeaba45bf6e311e756bcbe50952dbc687bab5f Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 12:00:27 +0200 Subject: [PATCH 03/16] Update README.rst --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 510c607..13b2e73 100644 --- a/README.rst +++ b/README.rst @@ -68,8 +68,8 @@ Acknoledgements: High dimensionality (HD) problems are those which have items with high number of dimensions -There are two types of HD problems: - a)set of items with large number of dimensions +There are two types of HD problems:: + a)set of items with large number of dimensions. b)set of items with a limited number of dimensions from a large available number of dimensions For example considering dimensions X, Y, Z, K, L, M and the items: item1=(X=2, Z=5, L=7) From fe70bc25781d38595634a85e9a8e2f14fa785d40 Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 12:01:14 +0200 Subject: [PATCH 04/16] Update README.rst --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 13b2e73..1aca39c 100644 --- a/README.rst +++ b/README.rst @@ -83,7 +83,7 @@ This distance function compares dimensions between 2 items. Each dimension of item1 is searched in item2, and if it is found, then the distance takes into account the difference (mahatan style) if the dimension does not exist in item2, a maximum value is added to the total distance between item1 and item2 -there is no difference with current usage: +there is no difference with current usage:: >>>cl = KMeansClustering(users,HDdistItems,HDequals); @@ -94,9 +94,9 @@ The new HDgetClusters() function is linear. Avoid the recalculation of centroids whereas original function getClusters() is N*N complex, because recalculate the centroid when move an item from one cluster to another. This new function can be used for low and high dimensionality problems, increasing -performance in both cases +performance in both cases:: - >>>solution = cl.HDgetclusters(numclusters,max_iterations); + >>>solution = cl.HDgetclusters(numclusters,max_iterations) Other new available optimization inside HDcentroid() function in is the use of mean instead median at centroid calculation. median is more accurate but involves more computations when N is huge. 
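For reference, the usage outlined in the README changes above can be exercised end to end with a short, self-contained sketch. It targets the same Python 2 interpreter as the rest of this series; the make_profile() helper, the vocabulary size and the cluster counts are illustrative assumptions only, and the flat (keyword, weight, keyword, weight, ...) tuple encoding is the one HDexample.py introduces later in the series::

    import random

    from cluster import KMeansClustering
    from cluster.HDdistances import HDdistItems, HDequals

    def make_profile(keywords):
        # One item: a flat tuple of (keyword, weight) pairs whose weights sum to 1.
        weights = [random.random() for _ in keywords]
        total = sum(weights)
        profile = []
        for keyword, weight in zip(keywords, weights):
            profile += [keyword, weight / total]
        return tuple(profile)

    # Ten keywords per item, drawn from a small illustrative vocabulary; ten matches
    # the per-profile dimension count the HD functions assume by default.
    vocabulary = [str(i) for i in range(50)]
    users = [make_profile(random.sample(vocabulary, 10)) for _ in range(100)]

    cl = KMeansClustering(users, HDdistItems, HDequals)
    clusters = cl.HDgetclusters(5, 10)  # 5 clusters, at most 10 k-means iterations
    print(len(clusters))
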
From 7223010ffea9ce1d83c2b2e7bafdd22ff67a156e Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 12:38:13 +0200 Subject: [PATCH 05/16] Update README.rst --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 1aca39c..e29dcee 100644 --- a/README.rst +++ b/README.rst @@ -60,6 +60,7 @@ The parameter passed to getclusters is the count of clusters generated. ======================================================================= Authors of new added functionalities: Garcia Aranda, Jose Javier jose_javier.garcia_aranda@alcatel-lucent.com + Ramos Diaz, Juan juanrd0088@gmail.com Acknoledgements: From 25ea0d3adf8617c29ce916e8d4bc5e7b7539f144 Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 12:41:28 +0200 Subject: [PATCH 06/16] Update README.rst --- README.rst | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index e29dcee..7d39b60 100644 --- a/README.rst +++ b/README.rst @@ -59,9 +59,8 @@ The parameter passed to getclusters is the count of clusters generated. 2015/07/20 NEW FUNCTIONALITIES FOR HIGH AND LOW DIMENSIONALITY PROBLEMS ======================================================================= Authors of new added functionalities: - Garcia Aranda, Jose Javier jose_javier.garcia_aranda@alcatel-lucent.com - - Ramos Diaz, Juan juanrd0088@gmail.com + - Garcia Aranda, Jose Javier jose_javier.garcia_aranda@alcatel-lucent.com + - Ramos Diaz, Juan juanrd0088@gmail.com Acknoledgements: Authors want to thank the Spanish Economy & competitiveness Ministry which funds this research @@ -81,10 +80,9 @@ operations than Low dimensionality problems. For case "b" (valid also for "a"), a new distance for HD problems is available: HDdistItems() ,HDequals() This distance function compares dimensions between 2 items. -Each dimension of item1 is searched in item2, and if it is found, then the distance takes into account the difference (mahatan style) -if the dimension does not exist in item2, a maximum value is added to the total distance between item1 and item2 +Each dimension of item1 is searched in item2, and if it is found, then the distance takes into account the difference (manhattan style). If the dimension does not exist in item2, a maximum value is added to the total distance between item1 and item2 -there is no difference with current usage:: +There is no difference with current usage:: >>>cl = KMeansClustering(users,HDdistItems,HDequals); From f4d416cd4058a9a52c0895858820b046797ecfb4 Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 12:45:16 +0200 Subject: [PATCH 07/16] Update AUTHORS new contributors --- AUTHORS | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/AUTHORS b/AUTHORS index 5fb6b96..7e53803 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,2 +1,6 @@ Michel Albert (exhuma@users.sourceforge.net) -Sam Sandberg (@LoisaidaSam) \ No newline at end of file +Sam Sandberg (@LoisaidaSam) + +high dimensionality functionalities: +Jose J. 
GarciaAranda (@jjaranda13) +Juan Ramos Diaz (@juanrd0088) From 56f7da55c28088b4a8dcd537ffb62077dbf53d0b Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 12:48:14 +0200 Subject: [PATCH 08/16] Update util.py added new function: HDcentroid() --- cluster/util.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/cluster/util.py b/cluster/util.py index b45ed09..c23910a 100644 --- a/cluster/util.py +++ b/cluster/util.py @@ -130,3 +130,38 @@ def centroid(data, method=median): for i in range(len(data[0])): out.append(method([x[i] for x in data])) return tuple(out) + +def HDcentroid(data): + dict_words={} + dict_weight={} + words_per_user=10 #10 words per user. This value is not used. + num_users_cluster=len(data)# len(data) is the number of users (user=item) + + for i in range (num_users_cluster): + words_per_user=len(data[i])/2 #each profile have 10 pairs of keyword, weight + for j in range (words_per_user): + word=(data[i])[j*2] + if (dict_words.has_key(word)) : + dict_words[word]+=1 + dict_weight[word]+=data[i][2*j+1] + else : + dict_words[word]=1 + dict_weight[word]=data[i][2*j+1] + #l is a ordered list of the keywords, with the sum of the weight of every popular keyword + l=dict_words.items() + l.sort(key=lambda x:10000000-x[1]) + + words_per_centroid=min(10,len(l)) + + out=[0]*words_per_centroid*2 + centroid_total_weight=0 + + for i in range (words_per_centroid): + tupla=l[i] # word, sum of weights + out[i*2]=tupla[0] + out[i*2+1]=dict_weight[tupla[0]]/tupla[1] + centroid_total_weight+=out[i*2+1] + #normalization of the centroid weight + for i in range(words_per_centroid): + out [i*2+1]=out[i*2+1]/centroid_total_weight + return tuple(out) From 4dad5478c0be242ccec4f3ae3b13769418e794ce Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 12:54:00 +0200 Subject: [PATCH 09/16] Update kmeans.py --- cluster/method/kmeans.py | 107 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 2 deletions(-) diff --git a/cluster/method/kmeans.py b/cluster/method/kmeans.py index 5a92f02..c16f448 100644 --- a/cluster/method/kmeans.py +++ b/cluster/method/kmeans.py @@ -14,9 +14,12 @@ # along with this library; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # +# new functions: HDgetCluster() and HDassignItem by: +# 2015 Jose Javier Garcia Aranda, Juan Ramos Diaz - -from cluster.util import ClusteringError, centroid, minkowski_distance +from cluster.util import ClusteringError, centroid, minkowski_distance, HDcentroid +import time +import datetime class KMeansClustering(object): @@ -166,3 +169,103 @@ def initialise_clusters(self, input_, clustercount): for item in input_: self.__clusters[count % clustercount].append(item) count += 1 + + + def HDgetclusters(self, count, max_iterations): + """ + Generates *count* clusters. + + :param count: The amount of clusters that should be generated. count + must be greater than ``1``. + :raises ClusteringError: if *count* is out of bounds. + """ + + # only proceed if we got sensible input + if count <= 1: + raise ClusteringError("When clustering, you need to ask for at " + "least two clusters! 
" + "You asked for %d" % count) + + # return the data straight away if there is nothing to cluster + if (self.__data == [] or len(self.__data) == 1 or + count == self.__initial_length): + return self.__data + + # It makes no sense to ask for more clusters than data-items available + if count > self.__initial_length: + raise ClusteringError( + "Unable to generate more clusters than " + "items available. You supplied %d items, and asked for " + "%d clusters." % (self.__initial_length, count)) + + self.initialise_clusters(self.__data, count) + + items_moved = True # tells us if any item moved between the clusters, + # as we initialised the clusters, we assume that + # is the case + + iteration=0 + #asi no, no obligar a hacer iteraciones, lo hago segun dice el algoritmo + #pero si llego a iteraciones paro, si termino antes de llegar, mejor + while items_moved is True: + items_moved = False + print "iterating",iteration + ts = time.time() + st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') + print st + iteration=iteration+1 + + #computation of centroids + my_centroids={} # new!! + for cluster in self.__clusters:# new!! + one_centroid=HDcentroid(cluster)# new!! + my_centroids[one_centroid]=cluster # new!! + + + + + #this few lines are new: + #print centroids . it works, for debug purposes only!! + #for i in my_centroids.keys(): + # print "key:",i # print the centroid!! + # print "value:",my_centroids[i] # print all elements of the cluster!! + #print my_centroids.keys()[0] # imprime el primer centroide. es una prueba + + #now we scan the N items without recalculation of centroids. Therefore, it is linear + for cluster in self.__clusters: + for centroid_aux, cluster_aux in my_centroids.iteritems(): + if cluster_aux == cluster: + centroid_cluster=centroid_aux + break; + for item in cluster: + res = self.HDassign_item(item, cluster,centroid_cluster,my_centroids)#modified!! + if items_moved is False: + items_moved = res + + if (iteration == max_iterations): + items_moved = False + return self.__clusters + + + def HDassign_item(self, item, origin, origin_centroid, my_centroids): + """ + Assigns an item from a given cluster to the closest located cluster. + + :param item: the item to be moved. + :param origin: the originating cluster. 
+ :param origin_centroid: centroid of the originating cluster + :my_centroids: dictionary of centroid,cluster + """ + closest_cluster=origin #my_centroids[closest_centroid]=closest_cluster + closest_centroid=origin_centroid + #for cluster in self.__clusters: + for centro in my_centroids.keys(): + if self.distance(item, centro) < self.distance( + item, closest_centroid): + closest_cluster = my_centroids[centro] + + if id(closest_cluster) != id(origin): + self.move_item(item, origin, closest_cluster) + return True + else: + return False From 55ae158395748c00d4f5bd116e562f77e4abc6be Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 12:54:56 +0200 Subject: [PATCH 10/16] Update util.py --- cluster/util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cluster/util.py b/cluster/util.py index c23910a..47a841d 100644 --- a/cluster/util.py +++ b/cluster/util.py @@ -14,6 +14,8 @@ # along with this library; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # +# new functions HDcentroid() by: +# 2015 Jose Javier Garcia Aranda , Juan Ramos Diaz from __future__ import print_function import logging From 2ae8e4b5bd39b4ce1472fa1aa975794aab99629d Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 13:00:23 +0200 Subject: [PATCH 11/16] Create HDdistances.py This file provides functionalities for High dimensionality problems but also for low dimensionality problems --- cluster/HDdistances.py | 71 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 cluster/HDdistances.py diff --git a/cluster/HDdistances.py b/cluster/HDdistances.py new file mode 100644 index 0000000..b8c8e68 --- /dev/null +++ b/cluster/HDdistances.py @@ -0,0 +1,71 @@ + +# This file provides functionalities for High dimensionality problems but also for low dimensionality problems +# - New Distance computation +# - SSE metric computation for assist the computation of the optimal number of clusters +# +# Authors: +# Jose Javier Garcia Aranda +# Juan Ramos Diaz + + + +#from cluster import KMeansClustering +#import KMeansClustering +#import ClusteringError +import util +import time +import datetime + +import random + +def HDdistItems(profile1,profile2): + #Distance function, this distance between two profiles is based on: + #For each keyword of user A, if the keyword is not present in user B , then the distance for this keyword is the weight in the user A. 
+ #If the keyword exists in both users, the weights are compared and the distance is the absolute difference + len1=len(profile1)/2 + len2=len(profile2)/2 + total_len=len1+len2 #this value usually is 20 + factor_len=20/total_len #this only work if the profile has less than 10 keys + distance = 0.0 + marked=[0]*20; + for i in range(len1): + found=False + for j in range(len2): + if profile1[i*2]==profile2[j*2]: + distance+=abs(profile1[i*2+1]-profile2[j*2+1]); + found=True; + marked[j*2]=1; + break; + if found==False: + distance+=profile1[i*2+1]; + + for i in range(len2): + if marked[i*2]==1: + continue; + distance+=profile2[i*2+1] + + distance=distance*factor_len + return distance + +def HDequals(profile1,profile2): + for i in range(10): + for j in range(10): + if profile1[i*2]!=profile2[j*2]: + return False + elif profile1[i*2+1]!=profile2[j*2+1]: + return False + return True + #return True; + +def HDcomputeSSE(solution,numclusters): + #This metric measure the cohesion of users into a cluster and the separation among clusters at the same time + partial_solution=0 + total_solution=0 + dist=0 + for i in range(numclusters): + partial_solution=0 + for j in solution[i]: + dist=HDdistItems(util.HDcentroid(solution[i]),j) + partial_solution+=dist*dist + total_solution+=partial_solution + return total_solution From ee2e62cd568dc1c34c42ebd44580f662c208ce2d Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 13:11:07 +0200 Subject: [PATCH 12/16] Create HDexample.py --- HDexample.py | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 HDexample.py diff --git a/HDexample.py b/HDexample.py new file mode 100644 index 0000000..724f90f --- /dev/null +++ b/HDexample.py @@ -0,0 +1,151 @@ +# -*- coding: cp1252 -*- +#from cluster import KMeansClustering +from cluster import KMeansClustering +from cluster import ClusteringError +from cluster import util +from cluster.util import HDcentroid +from cluster.HDdistances import HDdistItems, HDequals, HDcomputeSSE +import time +import datetime + +import random + +############################################################################### +# High Dimensionality problem example # +############################################################################### +# This High Dimensionality example creates N items (which are "users"). +# Each user is defined by his profile. +# A profile is a tuple of 10 pairs of keyword and weight ( 20 fields in total) +# weights are floating numbers and belong to 0..1 +# The summation of weights of a profile is normalized to 1 +# we consider 1000 diferent keywords +# A profile takes 8 keywords from first 200 keywords (the "popular" keywords) +# Each keyword is a dimension. Therefore there are 1000 possible dimensions +# A single user only have 10 dimensions +# Different users can have different dimensions. +# A new distance and equality function are defined for this use case +# +# cl = KMeansClustering(users,HDdistItems,HDequals); +# +# Additionally, now the number of iterations can be limited in order to save time +# Experimentally, we have concluded that 10 iterations is enough accurate for most cases. +# The new HDgetClusters() function is linear. Avoid the recalculation of centroids +# whereas original function getClusters() is N*N complex, because recalculate the +# centroid when move an item from one cluster to another. 
+# This new function can be used for low and high dimensionality problems, increasing +# performance in both cases +# +# solution = cl.HDgetclusters(numclusters,max_iterations); +# +# Other new available optimization inside HDcentroid() function in is the use of mean instead median at centroid calculation. +# median is more accurate but involves more computations when N is huge. +# The function HDcentroid() is invoked internally by HDgetclusters() +# +# The optional invocation of HDcomputeSSE() assist the computation of the optimal number or clusters. +# + + + +def createProfile(): + num_words=1000 + total_weight=0; + marked_word=[0]*num_words + repeated_word=False + list_profile=[] + returned_profile=(); + profile_aux=[]; + #10 pairs word, weight. + #Don't repeated words. + for i in range(8): + partial_weight=random.uniform(0,1) + total_weight+=partial_weight + repeated_word=False + while repeated_word==False: + random_word=random.randint(0,299) + if marked_word[random_word]==0: + marked_word[random_word]=1 + repeated_word=True + random_word= str(random_word) + tupla=[random_word,partial_weight] + list_profile.append(tupla) + for i in range(2): + partial_weight=random.uniform(0,1) + total_weight+=partial_weight + repeated_word=False + while repeated_word==False: + random_word=random.randint(300,999) + if marked_word[random_word]==0: + marked_word[random_word]=1 + repeated_word=True + random_word= str(random_word) + tupla=[random_word,partial_weight] + list_profile.append(tupla) + #Normalization of the profile + for i in range(5): + a=list_profile[i][0] + b=list_profile[i][1] + b=b/total_weight; #the sum of the weights must be 1 + profile_aux=([a,b]) + returned_profile+=tuple(profile_aux) + return returned_profile + +#################################################### +# MAIN # +#################################################### +sses=[0]*10 # stores the sse metric for each number of clusters from 5 to 50 +num_users=1000 +numsse=0 +numclusters=5 # starts at 5 +max_iteraciones=10 # for efficiency we limit the number of kmeans iterations +ts = time.time() +start_time=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') +while numclusters<=25: + supersol=0#supersolucion, dist between clusters centroids and items. + + usuarios=[] + for i in range(num_users):#en el range el numero de usuarios + usuario = createProfile() + #print usuario + #print i + usuarios.append(usuario) + #print distUsers(usuarios[0],usuario) + #numclusters=6 #linea para pruebas + + #print distUsers(usuarios[0],usuarios[1]) + #print usuarios + x=0; + print " initializing cluster..." 
+ + cl = KMeansClustering(usuarios,HDdistItems,HDequals); + print " clusterizando...",numclusters + ts = time.time() + st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') + print st + numclusters=numclusters + solucion = cl.HDgetclusters(numclusters,max_iteraciones); + #print x + #print "--------------------------------------------"; + + #print solucion[0]; + for i in range(numclusters): + #print "====="+str(i); + a = solucion[i] + print util.HDcentroid(a),"," + ts = time.time() + st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') + #print st + #supersol += HDcomputeSSE(solucion,numclusters) + + sses[numsse]=supersol + numsse+=1 + numclusters+=5 +ts = time.time() +horafin=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') +print "inicio:",horainicio +print "fin:",horafin +print "sses:",sses +f=open("resul2.txt","w") +f.write("sses:") +f.write(str(sses)) +f.write("\n") +f.close() From f275e9df4a56458f8ec6e155a84cd965ee7416e7 Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 20 Jul 2015 15:06:50 +0200 Subject: [PATCH 13/16] Update HDexample.py a High dimensionality example --- HDexample.py | 86 ++++++++++++++++++++-------------------------------- 1 file changed, 33 insertions(+), 53 deletions(-) diff --git a/HDexample.py b/HDexample.py index 724f90f..96e90d6 100644 --- a/HDexample.py +++ b/HDexample.py @@ -1,17 +1,9 @@ # -*- coding: cp1252 -*- -#from cluster import KMeansClustering -from cluster import KMeansClustering -from cluster import ClusteringError -from cluster import util -from cluster.util import HDcentroid -from cluster.HDdistances import HDdistItems, HDequals, HDcomputeSSE -import time -import datetime - -import random - ############################################################################### -# High Dimensionality problem example # +# High Dimensionality problem example +# Authors: +# 2015 Jose Javier Garcia Aranda , Juan Ramos Diaz +# ############################################################################### # This High Dimensionality example creates N items (which are "users"). # Each user is defined by his profile. @@ -43,8 +35,16 @@ # # The optional invocation of HDcomputeSSE() assist the computation of the optimal number or clusters. # +# +from cluster import KMeansClustering +from cluster import ClusteringError +from cluster import util +from cluster.util import HDcentroid +from cluster.HDdistances import HDdistItems, HDequals, HDcomputeSSE +import time +import datetime - +import random def createProfile(): num_words=1000 @@ -92,60 +92,40 @@ def createProfile(): #################################################### # MAIN # #################################################### -sses=[0]*10 # stores the sse metric for each number of clusters from 5 to 50 -num_users=1000 +sses=[0]*10 #stores the sse metric for each number of clusters from 5 to 50 +num_users=100 numsse=0 numclusters=5 # starts at 5 -max_iteraciones=10 # for efficiency we limit the number of kmeans iterations +max_iteraciones=10 ts = time.time() start_time=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') -while numclusters<=25: - supersol=0#supersolucion, dist between clusters centroids and items. - - usuarios=[] +while numclusters<=50: # compute SSE from num_clusters=5 to 50 + supersol=0#supersolucion, distancias entre el clusters y los usuarios. 
+ users=[] # users are the items of this example for i in range(num_users):#en el range el numero de usuarios - usuario = createProfile() - #print usuario - #print i - usuarios.append(usuario) - #print distUsers(usuarios[0],usuario) - #numclusters=6 #linea para pruebas - - #print distUsers(usuarios[0],usuarios[1]) - #print usuarios - x=0; - print " initializing cluster..." - - cl = KMeansClustering(usuarios,HDdistItems,HDequals); - print " clusterizando...",numclusters + user = createProfile() + users.append(user) + #x=0; + print " inicializing kmeans..." + cl = KMeansClustering(users,HDdistItems,HDequals); + print " executing...",numclusters ts = time.time() st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') print st numclusters=numclusters - solucion = cl.HDgetclusters(numclusters,max_iteraciones); - #print x - #print "--------------------------------------------"; - - #print solucion[0]; + solution = cl.HDgetclusters(numclusters,max_iteraciones); for i in range(numclusters): - #print "====="+str(i); - a = solucion[i] + a = solution[i] print util.HDcentroid(a),"," ts = time.time() st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') - #print st - #supersol += HDcomputeSSE(solucion,numclusters) - - sses[numsse]=supersol + + sses[numsse]=HDcomputeSSE(solution,numclusters) numsse+=1 numclusters+=5 ts = time.time() -horafin=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') -print "inicio:",horainicio -print "fin:",horafin +end_time=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') +print "start_time:",start_time +print "end_time:",end_time print "sses:",sses -f=open("resul2.txt","w") -f.write("sses:") -f.write(str(sses)) -f.write("\n") -f.close() + From 4b58056d868d4367d6c374b89cc3a252d5e5ce79 Mon Sep 17 00:00:00 2001 From: juanrd0088 Date: Mon, 20 Jul 2015 16:07:29 +0200 Subject: [PATCH 14/16] Update HDdistances.py bug en HddistItems: factor_len no era un numero real y no funcionaba. --- cluster/HDdistances.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cluster/HDdistances.py b/cluster/HDdistances.py index b8c8e68..f5d3dd7 100644 --- a/cluster/HDdistances.py +++ b/cluster/HDdistances.py @@ -25,9 +25,9 @@ def HDdistItems(profile1,profile2): len1=len(profile1)/2 len2=len(profile2)/2 total_len=len1+len2 #this value usually is 20 - factor_len=20/total_len #this only work if the profile has less than 10 keys + factor_len=20.0/total_len #this only work if the profile has less than 10 keys distance = 0.0 - marked=[0]*20; + marked=[0]*(total_len*2); for i in range(len1): found=False for j in range(len2): From 4340c79ccb1d50848dc1908cbb2fc5151382b1b6 Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Mon, 18 Jun 2018 16:43:54 +0200 Subject: [PATCH 15/16] solved pull request issues --- HDexample.py | 43 +++++++++++++++++------------------- README.rst | 15 +++++-------- cluster/HDdistances.py | 47 +++++++++++++++++++++------------------- cluster/method/kmeans.py | 37 ++++++++++++++----------------- cluster/util.py | 14 +++++++----- 5 files changed, 75 insertions(+), 81 deletions(-) diff --git a/HDexample.py b/HDexample.py index 96e90d6..f631513 100644 --- a/HDexample.py +++ b/HDexample.py @@ -36,17 +36,19 @@ # The optional invocation of HDcomputeSSE() assist the computation of the optimal number or clusters. 
# # +from __future__ import print_function from cluster import KMeansClustering from cluster import ClusteringError from cluster import util from cluster.util import HDcentroid -from cluster.HDdistances import HDdistItems, HDequals, HDcomputeSSE +from cluster.HDdistances import HDdistItems, HDequals, HDcomputeSSE, HD_profile_dimensions + import time import datetime - import random def createProfile(): + """create a profile composed of 10 dimensions chosen from 1000 dimensions""" num_words=1000 total_weight=0; marked_word=[0]*num_words @@ -55,6 +57,7 @@ def createProfile(): returned_profile=(); profile_aux=[]; #10 pairs word, weight. + HD_profile_dimensions=10 #Don't repeated words. for i in range(8): partial_weight=random.uniform(0,1) @@ -96,36 +99,30 @@ def createProfile(): num_users=100 numsse=0 numclusters=5 # starts at 5 -max_iteraciones=10 -ts = time.time() -start_time=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') +max_iterations=10 +start_time=datetime.datetime.now() while numclusters<=50: # compute SSE from num_clusters=5 to 50 - supersol=0#supersolucion, distancias entre el clusters y los usuarios. users=[] # users are the items of this example - for i in range(num_users):#en el range el numero de usuarios + for i in range(num_users): user = createProfile() users.append(user) - #x=0; - print " inicializing kmeans..." + print (" inicializing kmeans...") cl = KMeansClustering(users,HDdistItems,HDequals); - print " executing...",numclusters - ts = time.time() - st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') - print st + print (" executing...",numclusters) + st=datetime.datetime.now() + print (st) numclusters=numclusters - solution = cl.HDgetclusters(numclusters,max_iteraciones); + solution = cl.HDgetclusters(numclusters,max_iterations); for i in range(numclusters): a = solution[i] - print util.HDcentroid(a),"," - ts = time.time() - st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') - + print (util.HDcentroid(a),",") + st=datetime.datetime.now() + sses[numsse]=HDcomputeSSE(solution,numclusters) numsse+=1 numclusters+=5 -ts = time.time() -end_time=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') -print "start_time:",start_time -print "end_time:",end_time -print "sses:",sses +end_time=datetime.datetime.now() +print ("start_time:",start_time) +print ("end_time:",end_time) +print ("sses:",sses) diff --git a/README.rst b/README.rst index 7d39b60..0121219 100644 --- a/README.rst +++ b/README.rst @@ -59,24 +59,21 @@ The parameter passed to getclusters is the count of clusters generated. 2015/07/20 NEW FUNCTIONALITIES FOR HIGH AND LOW DIMENSIONALITY PROBLEMS ======================================================================= Authors of new added functionalities: - - Garcia Aranda, Jose Javier jose_javier.garcia_aranda@alcatel-lucent.com + - Garcia Aranda, Jose Javier jjaranda13@gmail.com - Ramos Diaz, Juan juanrd0088@gmail.com Acknoledgements: - Authors want to thank the Spanish Economy & competitiveness Ministry which funds this research - through "INNPACTO" innovation program IPT-2012-0839-430000. - + Authors want to thank the Spanish Economy & competitiveness Ministry which funds this research through "INNPACTO" innovation program IPT-2012-0839-430000. High dimensionality (HD) problems are those which have items with high number of dimensions -There are two types of HD problems:: +There are two types of HD problems: a)set of items with large number of dimensions. 
- b)set of items with a limited number of dimensions from a large available number of dimensions - For example considering dimensions X, Y, Z, K, L, M and the items: + b)set of items with a limited number of dimensions from a large available number of dimensions:: + For example considering dimensions X, Y, Z, K, L, M and the items item1=(X=2, Z=5, L=7) item2=(X=6, Y=5, M=7) -The HD problems involves a high cost computation because distance functions in this case takes more -operations than Low dimensionality problems. +The HD problems involves a high cost computation because distance functions in this case takes more operations than Low dimensionality problems. For case "b" (valid also for "a"), a new distance for HD problems is available: HDdistItems() ,HDequals() This distance function compares dimensions between 2 items. diff --git a/cluster/HDdistances.py b/cluster/HDdistances.py index f5d3dd7..0bff9a4 100644 --- a/cluster/HDdistances.py +++ b/cluster/HDdistances.py @@ -1,31 +1,33 @@ -# This file provides functionalities for High dimensionality problems but also for low dimensionality problems -# - New Distance computation -# - SSE metric computation for assist the computation of the optimal number of clusters -# -# Authors: -# Jose Javier Garcia Aranda -# Juan Ramos Diaz +""" This file provides functionalities for High dimensionality problems but also for low dimensionality problems + added functionalities: + - New Distance computation + - SSE metric computation for assist the computation of the optimal number of clusters - -#from cluster import KMeansClustering -#import KMeansClustering -#import ClusteringError + Authors: + Jose Javier Garcia Aranda + Juan Ramos Diaz +""" import util import time import datetime - import random +HD_profile_dimensions=10 #dimensions per profile, default value is 10 + def HDdistItems(profile1,profile2): - #Distance function, this distance between two profiles is based on: - #For each keyword of user A, if the keyword is not present in user B , then the distance for this keyword is the weight in the user A. - #If the keyword exists in both users, the weights are compared and the distance is the absolute difference - len1=len(profile1)/2 - len2=len(profile2)/2 + """Distance function, this distance between two profiles is defined as: + For each keyword of user A, if the keyword is not present in user B , then the distance for this keyword is the weight in the user A. + If the keyword exists in both users, the weights are compared and the distance is the absolute difference. 
+ For each keyword present in the union of keywords of both profiles, the distance is computed and added to the total distance between both users + """ + + len1=len(profile1)/2 # len(profile1) is always pair because each dimension has a weight + len2=len(profile2)/2 # len(profile2) is always pair because each dimension has a weight total_len=len1+len2 #this value usually is 20 - factor_len=20.0/total_len #this only work if the profile has less than 10 keys + #factor_len=20.0/total_len #this only work if the profile has less than 10 keys + factor_len=2.0*HD_profile_dimensions/total_len #this only work if the profile has less than 10 keys distance = 0.0 marked=[0]*(total_len*2); for i in range(len1): @@ -48,17 +50,18 @@ def HDdistItems(profile1,profile2): return distance def HDequals(profile1,profile2): - for i in range(10): - for j in range(10): + for i in range(HD_profile_dimensions): + for j in range(HD_profile_dimensions): if profile1[i*2]!=profile2[j*2]: return False elif profile1[i*2+1]!=profile2[j*2+1]: return False return True - #return True; + def HDcomputeSSE(solution,numclusters): - #This metric measure the cohesion of users into a cluster and the separation among clusters at the same time + """This metric measure the cohesion of users into a cluster and the separation among clusters at the same time""" + partial_solution=0 total_solution=0 dist=0 diff --git a/cluster/method/kmeans.py b/cluster/method/kmeans.py index c16f448..a03996a 100644 --- a/cluster/method/kmeans.py +++ b/cluster/method/kmeans.py @@ -205,31 +205,26 @@ def HDgetclusters(self, count, max_iterations): # is the case iteration=0 - #asi no, no obligar a hacer iteraciones, lo hago segun dice el algoritmo - #pero si llego a iteraciones paro, si termino antes de llegar, mejor + + #The number of iterations is limited to max_iterations. When this limit is reached, the items_moved is forced to false while items_moved is True: items_moved = False - print "iterating",iteration - ts = time.time() - st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') - print st + #print "iterating",iteration # for debug purposes + st=datetime.datetime.now() + # print st # for debug purposes iteration=iteration+1 #computation of centroids my_centroids={} # new!! - for cluster in self.__clusters:# new!! - one_centroid=HDcentroid(cluster)# new!! - my_centroids[one_centroid]=cluster # new!! - - - + for cluster in self.__clusters: + one_centroid=HDcentroid(cluster) + my_centroids[one_centroid]=cluster - #this few lines are new: #print centroids . it works, for debug purposes only!! #for i in my_centroids.keys(): # print "key:",i # print the centroid!! # print "value:",my_centroids[i] # print all elements of the cluster!! - #print my_centroids.keys()[0] # imprime el primer centroide. es una prueba + #print my_centroids.keys()[0] # print the fist centroid. for testing #now we scan the N items without recalculation of centroids. Therefore, it is linear for cluster in self.__clusters: @@ -238,7 +233,7 @@ def HDgetclusters(self, count, max_iterations): centroid_cluster=centroid_aux break; for item in cluster: - res = self.HDassign_item(item, cluster,centroid_cluster,my_centroids)#modified!! 
+ res = self.HDassign_item(item, cluster,centroid_cluster,my_centroids) if items_moved is False: items_moved = res @@ -256,15 +251,15 @@ def HDassign_item(self, item, origin, origin_centroid, my_centroids): :param origin_centroid: centroid of the originating cluster :my_centroids: dictionary of centroid,cluster """ - closest_cluster=origin #my_centroids[closest_centroid]=closest_cluster + closest_cluster=origin closest_centroid=origin_centroid - #for cluster in self.__clusters: - for centro in my_centroids.keys(): - if self.distance(item, centro) < self.distance( + + for center in my_centroids.keys(): + if self.distance(item, center) < self.distance( item, closest_centroid): - closest_cluster = my_centroids[centro] + closest_cluster = my_centroids[center] - if id(closest_cluster) != id(origin): + if closest_cluster is not origin: self.move_item(item, origin, closest_cluster) return True else: diff --git a/cluster/util.py b/cluster/util.py index 47a841d..3d04e37 100644 --- a/cluster/util.py +++ b/cluster/util.py @@ -19,6 +19,7 @@ from __future__ import print_function import logging +from HDdistances import HD_profile_dimensions logger = logging.getLogger(__name__) @@ -136,11 +137,10 @@ def centroid(data, method=median): def HDcentroid(data): dict_words={} dict_weight={} - words_per_user=10 #10 words per user. This value is not used. - num_users_cluster=len(data)# len(data) is the number of users (user=item) + num_users_cluster=len(data)# len(data) is the number of items for i in range (num_users_cluster): - words_per_user=len(data[i])/2 #each profile have 10 pairs of keyword, weight + words_per_user=len(data[i])/2 #each profile have pairs of keywords, weight for j in range (words_per_user): word=(data[i])[j*2] if (dict_words.has_key(word)) : @@ -149,11 +149,13 @@ def HDcentroid(data): else : dict_words[word]=1 dict_weight[word]=data[i][2*j+1] - #l is a ordered list of the keywords, with the sum of the weight of every popular keyword + #l is a non-ordered list of the keywords, with the sum of the weight of every popular keyword l=dict_words.items() + #l is going to be converted into an ordered list of the keywords by popularity + # this sort() invocation works if the number of dimensions is less than 10000000 l.sort(key=lambda x:10000000-x[1]) - - words_per_centroid=min(10,len(l)) + + words_per_centroid=min(HD_profile_dimensions,len(l)) out=[0]*words_per_centroid*2 centroid_total_weight=0 From 5d818c8a64efd4fa988a93765f53df9ca94fbdc1 Mon Sep 17 00:00:00 2001 From: jjaranda13 Date: Fri, 6 Jul 2018 09:36:02 +0200 Subject: [PATCH 16/16] exuma request --- cluster/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster/util.py b/cluster/util.py index 3d04e37..fde343c 100644 --- a/cluster/util.py +++ b/cluster/util.py @@ -19,7 +19,7 @@ from __future__ import print_function import logging -from HDdistances import HD_profile_dimensions +from .HDdistances import HD_profile_dimensions logger = logging.getLogger(__name__)