#!/usr/sfw/bin/python
# -*- coding: iso-8859-15 -*-
# Usage (Windows example): C:\Python27\python.exe [path to PartitionDistance.py]\PartitionDistance.py ProjetInfoling2012PartitionDeReference.txt ProjetInfoling2012PartitionExemple.txt

import sys, os, re, string, time
from math import *

#------------------------------
# Partition Distance
#------------------------------
# Two partition files given as parameters:
# One element per line,
# elements in the same cluster appear on consecutive lines,
# clusters are separated by empty lines.
#------------------------------

def openPartition(filename,format):
    """Read a partition file and return it as a list of clusters.

    The file holds one element per line; elements of the same cluster are
    on consecutive lines and clusters are separated by one or more blank
    (empty or whitespace-only) lines.

    filename: path of the partition file to read.
    format:   unused; kept so that existing callers keep working.

    Returns a list of clusters, each cluster being the list of its elements
    (line contents without the end-of-line characters) in file order.
    Leading, trailing and repeated blank lines never produce an empty
    cluster or a blank element.
    """
    parti = []
    cluster = []
    # "with" guarantees the file is closed even if reading fails.
    with open(filename, "r") as fd:
        for line in fd:
            # Drop the line terminator only ("\n" or "\r\n"); any other
            # whitespace inside an element is kept as is.
            line = line.rstrip("\r\n")
            if line.strip():
                cluster.append(line)
            elif cluster:
                # A blank line closes the current cluster.  Blank lines met
                # while no cluster is open are skipped instead of being
                # stored as bogus elements of the next cluster.
                parti.append(cluster)
                cluster = []
    # The last cluster is not followed by a blank line in general.
    if cluster:
        parti.append(cluster)
    return parti


# partition given as a list of lists
def ClustersToDico(partition):
    """Map every element of *partition* to the index of its cluster.

    partition: list of clusters, each cluster being a list of elements.

    Returns a dictionary {element: cluster index}; indices follow the
    position of the clusters in *partition*.  An element listed several
    times keeps the index of the last cluster in which it appears.
    """
    index_of = {}
    for cluster_index, cluster in enumerate(partition):
        # dict.fromkeys gives every member of the cluster the same index.
        index_of.update(dict.fromkeys(cluster, cluster_index))
    return index_of


# part1 and part2: dictionaries mapping each element to its cluster index
# parti1 and parti2: lists of clusters (cluster = list of elements)
# Elements of part1 that are missing from part2 are added to partition 2 as singleton clusters
def completePartitions(part1,part2,parti1,parti2):
    """Add to partition 2 every element that only occurs in partition 1.

    part1:  dictionary mapping each element of partition 1 to its cluster
            index (only read).
    part2:  dictionary mapping each element of partition 2 to its cluster
            index (modified in place).
    parti1: list of clusters of partition 1 (not used).
    parti2: list of clusters of partition 2 (modified in place).

    Each element of part1 that is not a key of part2 is appended to parti2
    as a new singleton cluster and registered in part2 with the index of
    that cluster.  Returns None.
    """
    for element in part1:
        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if element not in part2:
            # The new singleton goes at the end of parti2, so its index is
            # the current length of the list.
            part2[element] = len(parti2)
            parti2.append([element])

        
# part1 and part2: dictionaries mapping each element to its cluster index
# parti1 and parti2: lists of clusters (cluster = list of elements)
# http://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index
def adjustedRand(part1,part2,parti1,parti2):
    """Return the adjusted Rand dissimilarity (1 - adjusted Rand index).

    part1, part2:   dictionaries mapping each element to its cluster index.
    parti1, parti2: lists of clusters; not used by the computation, kept so
                    that existing callers keep working.

    Only the elements of part1 are counted; each of them must also be a key
    of part2.

    Returns a float: 0.0 when both partitions are identical, larger values
    for less similar partitions (the value can exceed 1.0 when the
    agreement is below the chance level).  When the index is undefined
    (fewer than two elements, or both partitions made of a single cluster
    or of singletons only) the partitions are necessarily identical and
    0.0 is returned instead of raising ZeroDivisionError.

    Raises KeyError if an element of part1 is missing from part2.
    """
    n = len(part1)

    # Sparse contingency table: number of elements for each pair
    # (cluster of partition 1, cluster of partition 2), plus its row and
    # column totals (the cluster sizes).
    cells = {}
    row_sizes = {}
    col_sizes = {}
    for element in part1:
        row = part1[element]
        col = part2[element]
        cells[(row, col)] = cells.get((row, col), 0) + 1
        row_sizes[row] = row_sizes.get(row, 0) + 1
        col_sizes[col] = col_sizes.get(col, 0) + 1

    # Numbers of element pairs grouped together in both partitions (selem),
    # in partition 1 (srow) and in partition 2 (scol).  k*(k-1) is always
    # even, so "//" is exact and keeps everything in integers.
    selem = sum(count * (count - 1) // 2 for count in cells.values())
    srow = sum(size * (size - 1) // 2 for size in row_sizes.values())
    scol = sum(size * (size - 1) // 2 for size in col_sizes.values())
    total_pairs = n * (n - 1) // 2

    # Exact integer form of "0.5*(srow+scol) == srow*scol/total_pairs",
    # i.e. the denominator of the index is zero.  This also covers n < 2,
    # where every count is zero.
    if (srow + scol) * total_pairs == 2 * srow * scol:
        return 0.0

    expected = (srow * scol) * 2.0 / (1.0 * n * (n - 1))
    index = (selem - expected) / (0.5 * (srow + scol) - expected)
    return 1 - index


# part1 and part2: dictionaries mapping each element to its cluster index
# parti1 and parti2: lists of clusters (cluster = list of elements)
# http://en.wikipedia.org/wiki/Rand_index#Rand_index
def Rand(part1,part2,parti1,parti2):
    """Return the Rand dissimilarity (1 - Rand index) of two partitions.

    part1, part2:   dictionaries mapping each element to its cluster index.
    parti1, parti2: lists of clusters; not used by the computation, kept so
                    that existing callers keep working.

    Only the elements of part1 are counted; each of them must also be a key
    of part2.

    Returns a float in [0.0, 1.0]: the proportion of element pairs on which
    the partitions disagree (grouped together in one, separated in the
    other).  With fewer than two elements there is no pair to compare and
    0.0 is returned instead of raising ZeroDivisionError.

    Raises KeyError if an element of part1 is missing from part2.
    """
    n = len(part1)

    # Sparse contingency table: number of elements for each pair
    # (cluster of partition 1, cluster of partition 2), plus its row and
    # column totals (the cluster sizes).  Counting pairs from these sizes
    # avoids enumerating all n*(n-1)/2 pairs of elements.
    cells = {}
    row_sizes = {}
    col_sizes = {}
    for element in part1:
        row = part1[element]
        col = part2[element]
        cells[(row, col)] = cells.get((row, col), 0) + 1
        row_sizes[row] = row_sizes.get(row, 0) + 1
        col_sizes[col] = col_sizes.get(col, 0) + 1

    total_pairs = n * (n - 1) // 2
    if total_pairs == 0:
        return 0.0

    # aa: pairs grouped together in both partitions.
    aa = sum(count * (count - 1) // 2 for count in cells.values())
    together1 = sum(size * (size - 1) // 2 for size in row_sizes.values())
    together2 = sum(size * (size - 1) // 2 for size in col_sizes.values())
    # dd: pairs separated in both partitions (inclusion-exclusion: remove
    # the pairs grouped in either partition, add back those removed twice).
    dd = total_pairs - together1 - together2 + aa

    index = (aa + dd) / (1.0 * total_pairs)
    return 1 - index
        

#------------------------------
# Getting the parameters
#------------------------------
file1=""
file2=""


def _report(*items):
    """Write *items* separated by single spaces, followed by a newline.

    Produces the same text as the Python 2 statement "print a, b, c" while
    remaining valid syntax under Python 3.
    """
    sys.stdout.write(" ".join(str(item) for item in items) + "\n")


if len(sys.argv)<3:
    # Without both file names there is nothing to check or compute: stop
    # here instead of also complaining that file "" does not exist.
    _report("Please add as parameters two file names corresponding to partition files.")
else:
    file1=sys.argv[1]
    file2=sys.argv[2]
    if not(os.path.isfile(file1)):
        _report("First file ("+file1+") does not exist.")
    elif not(os.path.isfile(file2)):
        _report("Second file ("+file2+") does not exist.")
    else:
        #------------------------------
        # Loading the files
        #------------------------------
        # parti1 and parti2: lists of clusters (cluster = list of elements)
        # part1 and part2: dictionaries mapping each element to its cluster index
        parti1 = openPartition(file1,"clas")
        parti2 = openPartition(file2,"clas")
        part1=ClustersToDico(parti1)
        part2=ClustersToDico(parti2)

        _report("Partition 1:",len(parti1)," classes - ", len(part1), " elements")
        _report("Partition 2:",len(parti2)," classes - ", len(part2), " elements")

        # The distances below look up every element of partition 1 in
        # partition 2.  The two calls that would first pad each partition
        # with singleton clusters for the elements found only in the other
        # one are left disabled:
        #completePartitions(part1,part2,parti1,parti2)
        #completePartitions(part2,part1,parti2,parti1)

        #------------------------------
        # Computing partition distances
        #------------------------------
        _report("Adjusted Rand dissimilarity:",adjustedRand(part1,part2,parti1,parti2))
        _report("Rand dissimilarity:",Rand(part1,part2,parti1,parti2))