import string, sys, os, glob, os.path, time, math, HTMLgen import config, re import cgi ClutoParamMap = { 'clmethod': ['Cluster Method', {'rb':'Repeated Bisection', 'rbr':'Repeated Bisection K Way', 'direct': 'Direct', 'agglo': 'Agglomerative', 'graph': 'Graph'}], 'sim': ['Similarity Function', {'cos': 'Cosine', 'corr': 'Correlation Coefficient', 'dist': 'Euclidean Distance', 'jacc': 'Jaccard Coefficient'}], 'crfun': ['Criterion Function', {'i1':'I1', 'i2':'I2', 'e1':'E1', 'g1':'G1', 'g1p':"G1'", 'h1':'H1', 'h2':'H2', 'slink':'Single Link', 'wslink':'Wt Single Link', 'clink':'Complete Link', 'wclink':'Wt Comlete Link', 'upgma':'UPGMA'}], 'cstype': ['Cluster Selection', {'best':'Best', 'large':'Large', 'largest':'Largest'}], 'rowmodel': ['RowModel', {'none':'None', 'maxtf':'MAXTF', 'sqrt':'Square Root', 'log':'Log'}], 'colprune': ['Column Prune'], 'grmodel': ['Graph Model', {'sd':'Symmetric-Direct', 'ad':'Asymmetric-Direct', 'sl':'Symmetric-Link', 'al':'Asymmetric-Link', 'none':'None'}], 'edgeprune': ['Edge Prune'], 'vtxprune': ['Vertex Prune'], 'mincomponent': ['Minimum Components'], 'ntrials': ['Number of Trials'], 'niter': ['Number of Iterations'], 'numclusters': ['Number of Clusters'] } class ClutoError(Exception): def getUploadStr(self, qMap): return 'Please Upload the file again.' def getClusterStr(self, qMap): return 'Please Cluster again.' def __init__(self, type, args): self.type = type self.args = args if self.type == 'UploadGreaterFileSize': self.title = 'Upload File Size > sizeLimit' self.details = 'File size of uploaded file exceeds the file' + \ 'size limit of %dKb.'%args[0] + \ self.getUploadStr(self.args[1]) elif self.type == 'UploadIncorrectFields': self.title = 'Incorrect Number of Fields' self.details = 'Incorrect number of lines in uploaded file %s '%args[0] + \ 'on line %d.\n
Incorrect Line:

%s\n

'%(args[1], args[2]) + \ '

' + self.getUploadStr(self.args[3]) elif self.type == 'UploadFileExists': self.title = 'Dataset with that name exists' self.details = 'Dataset: %s exists.

'%self.args[0] + \ self.getUploadStr(self.args[1]) elif self.type == 'UploadEmptyFile': self.title = 'Incorrect File Format' self.details = 'Uploaded file:%s has not data rows.

'%self.args[0] +\ self.getUploadStr(self.args[1]) elif self.type == 'UploadIncorrectFileName': self.title = 'Incorrect File Name' self.details = 'Uploaded file:%s has white space characters.

'%self.args[0] +\ self.getUploadStr(self.args[1]) elif self.type == 'UploadTextFound': self.title = 'Number format incorrect' self.details = 'Incorrect Text uploaded file %s '%args[0] + \ 'on line %d.\n
Incorrect Line:

%s\n

'%(args[1], args[2]) + \ '

' + self.getUploadStr(self.args[3]) elif self.type == 'ClusterIncorrectOption': self.title = 'Incorrect Options for Clustering' self.details = self.args[0] + '\n

'+ self.getClusterStr('') elif self.type == 'ClusterSolutionExists': self.title = 'Clustering Solutions Exists' self.details = 'Clustering Solution with that name %s exists.'%self.args[0] +\ '

Please cluster with diffferent name.' elif self.type == 'ClusterNoDelimiter': self.title = 'No Delimiters Selected' self.details = 'Please Select a Delimiter' elif self.type == 'ClusterClutoFailed': self.title = 'Clustering Failed' self.details = 'Error Message: %s'%(self.args[0]) else: print self.type raise Error('Unknown;...') #end class SessionObject: def __init__(self, qMap): """ Assumes that the directory for that session exists. Builds the session object from that directory. """ self.sesDir = os.path.join(config.ClutoDataDir, qMap['sesID']) self.datasets = [] self.solutions = [] for dName in glob.glob(self.sesDir+'/*'): assert(os.path.isdir(dName)) self.datasets.append(os.path.split(dName)[1]) solList = [] for fName in glob.glob(os.path.join(self.sesDir, dName)+'/*'): if os.path.isdir(fName): solList.append(os.path.split(fName)[1]) #end for self.solutions.append(solList) #end for assert(len(self.datasets) == len(self.solutions)) #end def #end class def getQueryMap(): queryList = cgi.parse_qsl(os.environ['QUERY_STRING']) qMap = {} for (key, value) in queryList: qMap[key] = value return qMap def getSessionObject(qMap): assert('sesID' in qMap) sesDir = os.path.join(config.ClutoDataDir, qMap['sesID']) if not os.path.exists(sesDir): os.mkdir(sesDir, config.DirCreationMode) #endif return SessionObject(qMap) #end def def getClutoParamList(clutoMap): pList = ['clmethod', 'sim', 'crfun', 'rowmodel'] clmethod = clutoMap['clmethod'] if (clmethod == 'rb') or (clmethod == 'rbr'): pList.extend(['cstype', 'ntrials', 'niter']) elif (clmethod == 'direct'): pList.extend(['ntrials', 'niter']) elif (clmethod == 'agglo'): pass elif (clmethod == 'graph'): grList = ['grmodel', 'edgeprune', 'vtxprune'] pList.extend(['cstype', 'ntrials', 'nnbrs']) pList.extend(grList) else: pass #end ## make sure the numClusters is a +ve integer if re.compile('\D+').search(clutoMap['numclusters']): errStr = 'Number of Clusters has to be a positive value not: %s'%clutoMap['numclusters'] raise ClutoError('ClusterIncorrectOption', [errStr]) #endif if int(clutoMap['numclusters']) > int(clutoMap['numrows']): errStr = 'Number of Clusters:%s has to be a less than Number of rows: %s in dataset'%(clutoMap['numclusters'], clutoMap['numrows']) raise ClutoError('ClusterIncorrectOption', [errStr]) #endif if clmethod <> 'graph': if clutoMap['sim'] in ['jacc', 'dist']: sList = ClutoParamMap['sim'] simStr = sList[1][clutoMap['sim']] errStr = '%s=%s can be used only with Cluster Method: Graph'%(sList[0], simStr) raise ClutoError('ClusterIncorrectOption', [errStr]) #endif if clmethod <> 'agglo': aggCRList = ['slink', 'wslink', 'clink', 'wclink', 'upgma'] if clutoMap['crfun'] in aggCRList: cList = ClutoParamMap['crfun'] crStr = cList[1][clutoMap['crfun']] errStr = '%s=%s can be used only with Cluster Method: Agglomerative'%(cList[0], crStr) raise ClutoError('ClusterIncorrectOption', [errStr]) #endif return pList #end ClutoParamList = ['niter', 'edgeprune', 'grmodel', 'clmethod', 'rowmodel', 'sim', 'vtxprune', 'ntrials', 'crfun', 'cstype', 'nnbrs', 'numrows'] def getClutoMap(form): solutionName = form.has_key('solName') and form['solName'].value or '' clutoMap = {} for param in ClutoParamList: clutoMap[param] = form[param].value #end for clutoMap['numclusters'] = form['numclusters'].value return (clutoMap,solutionName) def getBoundingBox(epsFile): for line in file(epsFile): if line[:13] == '%%BoundingBox': lList = string.split(string.strip(line)) lList = map(int, lList[1:]) return tuple(lList) return (0, 0, 0, 0) def createGifImage(qMap, stub): imgStub = os.path.join(config.ClutoDataDir, qMap['sesID'], qMap['data'], qMap['sol'], '%s'%stub) jpgFile = '%s.jpg'%imgStub pdfFile = '%s.pdf'%imgStub epsFile = '%s.eps'%imgStub if os.path.exists(jpgFile): return #endif os.system('convert %s %s > %s-convert.err'%(pdfFile, epsFile, epsFile)) (x1,y1, x2, y2) = getBoundingBox(epsFile) (width, height) = (x2-x1, y2-y1) #Compute new width (mi,mx) = (min(width, height), max(width, height)) scale = min(config.MinPixelDim/mi, config.MaxPixelDim/mx) gStr = '%dx%d'%(int(width*scale), int(height*scale)) os.system('convert -geometry %s -density 300 %s %s >> %s-convert.err'%(gStr, epsFile, jpgFile, jpgFile)) sizeFile = file('%s.imgSize'%epsFile[:-4], 'w') sizeFile.write('%d %d'%(width, height)) sizeFile.close() #end def def createVRMLWorld(clutoMap,qMap): dataDir = os.path.join(config.ClutoDataDir, qMap['sesID'], qMap['data']) nClusters = os.path.join(dataDir, clutoMap['numclusters']) solDir = os.path.join(dataDir, qMap['sol']) vrmlFile = os.path.join(solDir, 'cluto.wrl') vrmlErr = os.path.join(solDir, 'cluto.wrl.err') if not os.path.exists(vrmlFile): dataset = os.path.join(dataDir, 'data.dmat') clustFile = os.path.join(solDir, 'cluto.clust') cmd = '%s %s %s %s > %s'%(config.VRMLGen, dataset, clustFile, nClusters, vrmlFile) retCode = os.system(cmd) #endif return vrmlFile #end def def buildCmd(clutoMap, qMap, solName): paramList = getClutoParamList(clutoMap) fileParamList = ['clustfile', 'cltreefile'] dataDir = os.path.join(config.ClutoDataDir, qMap['sesID'], qMap['data']) solDir = os.path.join(dataDir, solName) cmdStrList = [] for param in paramList: cmdStrList.append('-%s=%s'%(param, clutoMap[param])) #end for for fileParam in fileParamList: fileN = os.path.join(solDir, 'cluto.%s'%fileParam[:-4]) cmdStrList.append('-%s=%s'%(fileParam, fileN)) #end for #Generate Tree str if clutoMap['clmethod'] <> 'graph': cmdStrList.append('-treefile=%s'%os.path.join(solDir, 'cluto.tree')) if clutoMap['clmethod'] in ['rb', 'rbr', 'direct']: cmdStrList.append('-fulltree') #endif if clutoMap['sim'] not in ['dist', 'jacc']: cmdStrList.append('-showfeatures') #endif if os.path.exists(os.path.join(dataDir, 'data.rlabel-short')): rLFileN = os.path.join(dataDir, 'data.rlabel-short') else: rLFileN = os.path.join(dataDir, 'data.rlabel') #endif if os.path.exists(os.path.join(dataDir, 'data.clabel-short')): cLFileN = os.path.join(dataDir, 'data.clabel-short') else: cLFileN = os.path.join(dataDir, 'data.clabel') #endif labelStr = '-rlabelfile=%s -clabelfile=%s'%(rLFileN, cLFileN) #plotStr = '-plotmatrix=%s/cluto.pdf -plotclusters=%s/clutoClust.pdf'%(solDir, solDir) plotStr = '-plotmatrix=%s/cluto.pdf'%(solDir) dataset = os.path.join(dataDir, 'data.dmat') otherParam = '-clustercolumns -zeroblack %s'%plotStr clutoParam = '%s %s %s %s %s'%(string.join(cmdStrList, ' '), labelStr, otherParam, dataset, clutoMap['numclusters']) return (clutoParam, paramList) def clusterData(qMap): sObj = getSessionObject(qMap) form = cgi.FieldStorage() (clutoMap, solName) = getClutoMap(form) datasetName = qMap['data'] solIdx = sObj.datasets.index(datasetName) solutions = sObj.solutions[solIdx] solName = solName and solName or 'sol-%d'%(len(solutions)+1) if solName in solutions: raise ClutoError('ClusterSolutionExists', [solName, qMap]) #endif ## >>> Beyond this point Solution Directory should be removed on *failure*. <<< solDir = os.path.join(config.ClutoDataDir, qMap['sesID'], datasetName, solName) os.mkdir(solDir, config.DirCreationMode) qMap['sol'] = solName #Do the clustering (clutoParam, printParamList) = buildCmd(clutoMap, qMap, solName) cmdFile = file(os.path.join(solDir, 'cluto.cmd'), 'w') cmdFile.write('%s'%clutoParam); cmdFile.close() printParamList.insert(1, 'numclusters') paramFile = file(os.path.join(solDir, 'cluto.param'), 'w') for paramKey in printParamList: paramFile.write('%s %s\n'%(paramKey,clutoMap[paramKey])) #end paramFile.close(); (optFileN, errFileN) = (os.path.join(solDir, 'cluto.output'), os.path.join(solDir, 'cluto.err')) retCode = os.system('%s %s > %s 2> %s'%(config.ClutoCmd, clutoParam, optFileN, errFileN)) if (retCode <> 0): errStr = file(errFileN).read() os.system('rm -rf %s'%solDir) raise ClutoError('ClusterClutoFailed', [errStr, qMap]) #endif ## errStr = cerr.read() ## if errStr != '': ## sys.stderr.write('Cluto Error: %s'%errStr) ## assert(0) ## return 0 ## #endif ## sys.stderr.write('Writing Output') ## outputFile = file(os.path.join(solDir, 'cluto.output'), 'w') ## outputFile.write(cout.read()) ## outputFile.close() ## sys.stderr.write('Done Writing Output') ## create Gif Image ##createGifImage(qMap): ## create VRML File createVRMLWorld(clutoMap,qMap) return 1 #end for def getSolutions(qMap): datasetDir = os.path.join(config.ClutoDataDir, qMap['sesID'], qMap['data']) (solColumns, solData) = ([], []) for sName in glob.glob(datasetDir + '/*.clust'): solName = os.path.splitext(os.path.split(sName)[1])[0] solColumns.append(solName) solList = string.split(file(sName).read(), '\n') solData.append(solList) #end for if solData: solData = map(list, zip(*solData)) return (solColumns, solData) def addSolution(qMap): datasetDir = os.path.join(config.ClutoDataDir, qMap['sesID'], qMap['data']) solDir = os.path.join(datasetDir, qMap['sol']) dataSoln = os.path.join(datasetDir, '%s.clust'%qMap['sol']) solSoln = os.path.join(solDir, 'cluto.clust') if not os.path.exists(dataSoln): oFile = file(dataSoln, 'w') iFile = file(solSoln) oFile.write(iFile.read()) oFile.close() iFile.close() #endif #end def shortenNames(namesList, maxSize): lenList = map(lambda x: len(x), namesList) if max(lenList) <= maxSize: return [] #endif (subSeqMap, sslen) = ({}, maxSize) for name in namesList: numSubStr = max(1, (len(name) - sslen)) ssMap = {} for i in range(numSubStr): ss = name[i:i+sslen] ssMap[ss] = 0 #endfor for ss in ssMap.keys(): val = subSeqMap.get(ss, 0) subSeqMap[ss] = val+1 #end #end for sNamesList = [] for name in namesList: (nLenList, newName) = ([], []) numSubStr = max(1, (len(name) - sslen)) for i in range(numSubStr): nLenList.append(subSeqMap[name[i:i+sslen]]) #end for minI = nLenList.index(min(nLenList)) if len(nLenList) == 1: newName = name; elif minI == 0: newName = name[:sslen] + '..' elif minI == len(nLenList)-1: newName = '%s..%s'%(name[0], name[minI:minI+sslen]) else: newName = '%s..%s..'%(name[0], name[minI:minI+sslen]) #endfor sNamesList.append(newName) #end for return sNamesList delimMap = { 'delimTab' :'\t', 'delimComma' :',', 'delimSemicolon':';', 'delimSpace' :' ' } def writeSMDData(qMap, form, datasetName): fileItem = form['uploadedFile'] (colNames, rowNames, dataList) = ([], [], []) (lineNum, numCols, uploadSz) = (0, 0, 0) for line in fileItem.file: line = string.replace(line, '\n', '') line = string.replace(line, '\r', '') if not line: continue #endif lineNum += 1 uploadSz += len(line) if uploadSz > config.MaxUploadSize: raise ClutoError('UploadGreaterFileSize', [config.MaxUploadSize, qMap]) #endif lList = string.split(line, '\t') if lineNum == 1: colNames = lList[3:] numCols = len(colNames) elif lineNum == 2: pass else: rowNames.append(lList[0]) lList[3:] = map(lambda x: (x == '') and '0.0' or x, lList[3:]) dataList.append(lList[3:]) if len(lList) <> (numCols+3): raise ClutoError('UploadIncorrectFields', [fileItem.filename, lineNum, line, datasetName, qMap]) #end #end if (len(dataList) == 0) or (numCols == 0): raise ClutoError('UploadEmptyFile', [fileItem.filename, qMap]) numRows = len(dataList) datasetDir = os.path.join(config.ClutoDataDir, qMap['sesID'], datasetName) os.mkdir(datasetDir, config.DirCreationMode) dFile = file('%s/data.dmat'%datasetDir, 'w') dFile.write('%d %d\n'%(numRows, numCols)) for lList in dataList: dFile.write('%s\n'%string.join(lList, ' ')) #end for dFile.close() rFile = file('%s/data.rlabel'%datasetDir, 'w') rFile.write('%s\n'%string.join(rowNames, '\n')) rFile.close() shortRowNames = shortenNames(rowNames, config.MaxRowLabelLen) if shortRowNames: srFile = file('%s/data.rlabel-short'%datasetDir, 'w') srFile.write('%s\n'%string.join(shortRowNames, '\n')) srFile.close() #endif cFile = file('%s/data.clabel'%datasetDir, 'w') cFile.write('%s\n'%string.join(colNames, '\n')) cFile.close() shortColNames = shortenNames(colNames, config.MaxColLabelLen) if shortColNames: scFile = file('%s/data.clabel-short'%datasetDir, 'w') scFile.write('%s\n'%string.join(shortColNames, '\n')) scFile.close() #endif qMap['data'] = datasetName return 1 #endif def writeDelimitedData(qMap, form, datasetName): firstRow = form.has_key('headerRow') and 1 or 0 firstCol = form.has_key('headerCol') and 1 or 0 delimiters = map(lambda x: delimMap[x], filter(lambda x: form.has_key(x), delimMap.keys())) if form.has_key('delimOther'): delimiters.append(form['otherText'].value) #endif if len(delimiters) == 0: raise ClutoError("ClusterNoDelimiter", [qMap]) splitRe = re.compile('[%s]*'%string.join(delimiters,'')) ## Parse the dataset fileItem = form['uploadedFile'] (colNames, rowNames, dataList) = ([], [], []) (firstLine, lineNum, numCols, uploadSz) = (1, 0, 0, 0) for line in fileItem.file: lineNum += 1 uploadSz += len(line) line = string.replace(line, '\n', '') line = string.replace(line, '\r', '') if uploadSz > config.MaxUploadSize: raise ClutoError('UploadGreaterFileSize', [config.MaxUploadSize, qMap]) #endif lList = splitRe.split(line) if lList[0] == '': lList.pop(0) #endif if firstCol: rowNames.append(lList.pop(0)) if firstLine: firstLine = 0 numCols = len(lList) if firstRow: colNames = lList else: dataList.append(lList) try: map(float, lList) except ValueError: raise ClutoError('UploadTextFound', [fileItem.filename, lineNum, line, qMap]) #except else: if len(lList) <> (numCols): raise ClutoError('UploadIncorrectFields', [fileItem.filename, lineNum, line, qMap]) dataList.append(lList) try: map(float, lList) except ValueError: raise ClutoError('UploadTextFound', [fileItem.filename, lineNum, line, qMap]) #except #end #end for if (len(dataList) == 0) or (numCols == 0): raise ClutoError('UploadEmptyFile', [fileItem.filename, qMap]) #endif numRows = len(dataList) if not colNames: colNames = map(lambda x: 'col%d'%x, range(1, numCols+1)) #endif if not rowNames: rowNames = map(lambda x: 'row%d'%x, range(1, numRows+1)) #endif ## Write the dataset file datasetDir = os.path.join(config.ClutoDataDir, qMap['sesID'], datasetName) os.mkdir(datasetDir, config.DirCreationMode) dFile = file('%s/data.dmat'%datasetDir, 'w') dFile.write('%d %d\n'%(numRows, numCols)) for lList in dataList: dFile.write('%s\n'%string.join(lList, ' ')) #end for dFile.close() rFile = file('%s/data.rlabel'%datasetDir, 'w') rFile.write('%s\n'%string.join(rowNames, '\n')) rFile.close() shortRowNames = shortenNames(rowNames, config.MaxRowLabelLen) if shortRowNames: srFile = file('%s/data.rlabel-short'%datasetDir, 'w') srFile.write('%s\n'%string.join(shortRowNames, '\n')) srFile.close() #endif cFile = file('%s/data.clabel'%datasetDir, 'w') cFile.write('%s\n'%string.join(colNames, '\n')) cFile.close() shortColNames = shortenNames(colNames, config.MaxColLabelLen) if shortColNames: scFile = file('%s/data.clabel-short'%datasetDir, 'w') scFile.write('%s\n'%string.join(shortColNames, '\n')) scFile.close() #endif qMap['data'] = datasetName #enddef def uploadData(qMap): sObj = getSessionObject(qMap) form = cgi.FieldStorage() ## Get the uploaded file Name fileItem = form['uploadedFile'] if fileItem.filename.find('\\') != -1: ## Windows file uFileName = string.split(os.path.splitext(fileItem.filename)[0], '\\')[-1] else: uFileName = os.path.split(os.path.splitext(fileItem.filename)[0])[1] datasetName = form.has_key('datasetName') and form['datasetName'].value or uFileName datasetName = string.replace(datasetName, ' ', '_') if re.compile('\s+').search(datasetName): raise ClutoError('UploadIncorrectFileName', [datasetName, qMap]) if datasetName in sObj.datasets: raise ClutoError('UploadFileExists', [datasetName, qMap]) if form['format'].value == 'SMD': writeSMDData(qMap, form, datasetName) else: writeDelimitedData(qMap, form, datasetName) #end return #end def getDatasetStats(qMap): datasetFileN = os.path.join(config.ClutoDataDir, qMap['sesID'], qMap['data'], 'data.dmat') dMap = {} dFile = file(datasetFileN) (dMap['NumRows'], dMap['NumCols']) = string.split(dFile.readline()) dFile.close() dMap['UploadTime'] = time.asctime(time.localtime(os.path.getmtime(datasetFileN))) dMap['FileSize'] = os.path.getsize(datasetFileN)/1000.0 return dMap def buildDatasetSolutionOptions(sObj, qMap): dataSelected = ('data' in qMap) solSelected = ('sol' in qMap) dOptStr = '