"""Toy information-retrieval structures: items, features and similarity.

Items and features are interned per class by their ``Properties``; each one
is broken into ``Token`` objects that pair an item with a feature.
``IRstruct`` computes weights and a similarity matrix from token matches,
``SimThes`` replaces the weighting and matrix computation, and
``SimThes_CL`` renders a German-rows / English-columns MetaPost figure.

The module runs unchanged on Python 2.6+ and Python 3.
"""

import math
import string
import copy
import operator


# MetaPost preamble shared by every asMP() figure.  Backslashes are escaped
# explicitly: a bare '\u' in a non-raw literal is a syntax error on Python 3.
# One statement per line: '%' starts a MetaPost comment that runs to the end
# of the line, so the line breaks are significant.
_MP_PROLOGUE = (
    '\n'
    'verbatimtex\n'
    '%&latex\n'
    '\\documentclass{scrartcl}\n'
    '\\usepackage{amsmath}\n'
    '\\usepackage[T1]{fontenc}\n'
    '\\usepackage{concrete,euler}\n'
    '\\begin{document}\n'
    'etex\n'
    'prologues := 1;\n'
    '%input TEX;\n'
)


class Token(object):
    """One occurrence linking an ``Item`` to a ``Feature``.

    The token is created from whichever side already exists (``ic``); the
    other side is looked up or created from ``props``.  If ``ic`` is neither
    exactly an ``Item`` nor exactly a ``Feature``, both attributes keep their
    class-level default of ``None``.
    """

    item = None
    feat = None

    def __init__(self, ic, props):
        # Exact class checks (not isinstance) are kept from the original.
        if ic.__class__ is Item:
            self.item = ic
            self.feat = Feature(Document(props))
        elif ic.__class__ is Feature:
            self.feat = ic
            self.item = Item(Document(props))


class Tokenizable(object):
    """Marker base class for objects exposing ``props`` and ``tokenize()``."""
    pass


class TokString(Tokenizable):
    """A plain string whose tokens are its whitespace-separated words."""

    def __init__(self, strng):
        self.props = Properties([('content', strng)])

    def tokenize(self):
        """Return one ``Properties`` (with only 'content') per word."""
        return [Properties([('content', word)])
                for word in self.props['content'].split()]

    def __str__(self):
        return str(self.props['content'])

    __repr__ = __str__


class Document(Tokenizable):
    """A property bundle with optional 'content' text and nested 'docs'."""

    def __init__(self, props):
        self.props = Properties(props)

    def __repr__(self):
        return repr(self.props)

    def tokenize(self):
        """Return token properties for own content, then for nested docs.

        Each word of 'content' yields a copy of this document's properties
        with 'content' replaced by that word; documents listed under 'docs'
        contribute their own tokens afterwards.
        """
        res = []
        if 'content' in self.props:
            for word in self.props['content'].split():
                props = copy.copy(self.props)
                props['content'] = word
                res.append(props)
        if 'docs' in self.props:
            for doc in self.props['docs']:
                res += doc.tokenize()
        return res


class Properties(dict):
    """Properties of items, features, tokens etc.  These are separate
    from the identity of the items, etc. whose properties they describe.

    Properties form a partial subsumption ordering."""

    def __ge__(self, other):
        """Is self less specific than other?

        True when every key of self is present in other with an equal value.
        """
        for key in self:
            if key not in other:
                return False
            if other[key] != self[key]:
                return False
        return True

    def __le__(self, other):
        """Is self more specific than other?

        True when every key of other is present in self with an equal value.
        """
        for key in other:
            if key not in self:
                return False
            if self[key] != other[key]:
                return False
        return True

    def __hash__(self):
        """Properties are not expected to change, so they're hashable.

        The hash is the XOR of the hashes of the stringified keys (values are
        not included), computed with a plain loop because the ``reduce``
        builtin does not exist on Python 3.
        """
        digest = 0
        for key in self:
            digest ^= hash(str(key))
        return digest

    def asTeX(self):
        """Return 'content', or a bracketed list of the 'docs' renderings.

        Returns None when neither key is present.
        """
        # Disabled alternative rendering (kept for reference):
        ## texstr = '\( \\begin{bmatrix}'
        ## lines = []
        ## if 'content' in self:
        ##     lines.append('\\text{lemma:} & \\text{%s}' % (self['content']))
        ## if 'lang' in self:
        ##     lines.append('\\text{language:} & \\text{%s}' % (self['lang']))
        ## if 'docs' in self:
        ##     lines.append('\\text{lemma:} & \\text{%s}' % string.join([ x.props.asTeX() for x in self['docs']], ', '))
        ## texstr += string.join(lines, '\\\\')
        ## texstr += '\end{bmatrix} \)'
        ## return texstr
        if 'content' in self:
            return self['content']
        if 'docs' in self:
            return '[' + ', '.join([x.props.asTeX() for x in self['docs']]) + ']'
        return None


class IndexComp(object):
    """Base for interned index components (see ``Item`` and ``Feature``).

    Subclasses must provide two class-level dicts: ``list`` (properties ->
    instance, the intern table) and ``map`` (properties -> replacement
    properties applied before the lookup).
    """

    content = None
    tokens = []

    def __new__(cls, tokenizable):
        """Return the instance for ``tokenizable.props``, creating it if new.

        Note that ``tokenizable.props`` is rewritten in place when the class
        ``map`` has an entry for it.
        """
        if tokenizable.props in cls.map:
            tokenizable.props = copy.copy(cls.map[tokenizable.props])
        if tokenizable.props in cls.list:
            return cls.list[tokenizable.props]
        self = object.__new__(cls)
        self.content = tokenizable.props
        # Register before tokenizing: building the tokens looks up the
        # opposite component, which in turn may look this one up again.
        cls.list[tokenizable.props] = self
        self.tokens = [Token(self, tok) for tok in tokenizable.tokenize()]
        return self

    def __le__(self, other):
        return self.content <= other.content

    def __ge__(self, other):
        return self.content >= other.content

    def __repr__(self):
        return repr(self.content) + repr(self.tokens)

    def asTeX(self):
        """Render this component's properties via ``Properties.asTeX``."""
        return self.content.asTeX()


class Item(IndexComp):
    """Interned item; ``list`` and ``map`` are the per-class registries."""
    list = {}
    map = {}


class Feature(IndexComp):
    """Interned feature; ``list`` and ``map`` are the per-class registries."""
    list = {}
    map = {}


class IRstruct(object):
    """Item/feature collection with weighting and item-item similarity."""

    def __init__(self, items, features):
        """Intern each tokenizable in ``items`` / ``features``."""
        self.items = []
        self.feats = []
        self.matrix = []
        for item in items:
            self.items.append(Item(item))
        for feat in features:
            self.feats.append(Feature(feat))

    def featfreq(self, feat, item):
        """Count tokens of ``feat`` and ``item`` that match both of them."""
        return sum(1 for token in feat.tokens + item.tokens
                   if token.feat <= feat and token.item <= item)

    def itemfreq(self, feat):
        """Count the items that have at least one token matching ``feat``."""
        freq = 0
        for item in self.items:
            for token in feat.tokens + item.tokens:
                if token.feat <= feat and token.item <= item:
                    freq += 1
                    break
        return freq

    def iif(self, feat):
        """Return log((1 + number of items) / (1 + itemfreq(feat)))."""
        return math.log((1.0 + len(self.items)) / (1.0 + self.itemfreq(feat)))

    def maxff(self, item):
        """Return the largest featfreq of ``item`` over self.feats (min 0)."""
        return max([self.featfreq(x, item) for x in self.feats] + [0])

    def weightp(self, item, feat):
        """Return the unnormalised weight of ``feat`` for ``item``.

        The frequency ratio uses float division (the original truncated to
        0 or 1 under Python 2) and counts as 0 when the item matches no
        feature at all, instead of raising ZeroDivisionError.
        """
        peak = self.maxff(item)
        if peak:
            ratio = float(self.featfreq(feat, item)) / peak
        else:
            ratio = 0.0
        return (0.5 + 0.5 * ratio) * self.iif(feat)

    def weight(self, item, feat):
        """Return weightp normalised by the item's weight-vector length.

        Returns 0.0 when that length is zero (no features, or all weights
        zero) rather than dividing by zero.
        """
        norm = math.sqrt(sum([self.weightp(item, f) ** 2 for f in self.feats], 0.0))
        if not norm:
            return 0.0
        return self.weightp(item, feat) / norm

    def itemvec(self, item):
        """Return the normalised weights of ``item`` over self.feats."""
        return [self.weight(item, f) for f in self.feats]

    def sim(self, i, j):
        """Return the dot product of the normalised weights of ``i``, ``j``."""
        return sum([self.weight(i, f) * self.weight(j, f) for f in self.feats])

    def simMatrix(self):
        """Compute, store in self.matrix and return {(p, q): sim(p, q)}."""
        self.matrix = dict([((p, q), self.sim(p, q))
                            for q in self.items for p in self.items])
        return self.matrix

    def __repr__(self):
        if not self.matrix:
            self.simMatrix()
        return repr(self.matrix)

    def asTeX(self):
        """Return a LaTeX description list naming the items and features.

        Computes the similarity matrix first if it has not been built yet.
        """
        if not self.matrix:
            self.simMatrix()
        texstring = ''
        texstring += '\\begin{description}\n'
        texstring += '\\item[Items] ' + ', '.join([x.asTeX() for x in self.items]) + '\n'
        texstring += '\\item[Features] ' + ', '.join([x.asTeX() for x in self.feats]) + '\n'
        texstring += '\\end{description}\n\n'
        # Disabled tabular rendering of the matrix (kept for reference):
        # texstring += '\\begin{tabular}{r' + 'c'*len(self.items) + '}\n'
        # texstring += '& \\rotatebox{75}{' + string.join([repr(x) for x in self.items], '} & \\rotatebox{75}{') + '}\\\\\n'
        # texstring += string.join(
        #     [repr(p) + ' & ' + string.join(
        #         [ "%3.2f" % self.matrix[p, q] for q in self.items], ' & ') for p in self.items], '\\\\\n')
        # texstring += '\n\\end{tabular}\n'
        return texstring

    def _mp_figure(self, rows, cols):
        """Build the MetaPost source for a rows-by-cols similarity plot.

        Emits the prologue, one label per row and column, dashed guide
        lines, and for every positive self.matrix[p, q] a grey disc scaled
        by the value with the value printed on top.
        """
        if not self.matrix:
            self.simMatrix()
        nrows = len(rows)
        ncols = len(cols)
        out = [_MP_PROLOGUE, 'beginfig(1);\n']
        # row labels
        for r, p in enumerate(rows):
            out.append('draw thelabel.lft(btex %s etex, (%fcm, %fcm));\n'
                       % (p.asTeX(), -1, -r))
        # column labels
        for c, q in enumerate(cols):
            out.append('draw thelabel.urt(btex %s etex rotated 45, (%fcm, %fcm)) shifted (-.2cm,0);\n'
                       % (q.asTeX(), c, 1))
        for x in range(nrows):
            out.append('draw (%fcm, %fcm)--(%fcm, %fcm) dashed withdots;\n'
                       % (-1, -x, ncols - .5, -x))
        for x in range(ncols):
            out.append('draw (%fcm, %fcm)--(%fcm, %fcm) dashed withdots;\n'
                       % (x, 1, x, -nrows + .5))
        for r, p in enumerate(rows):
            for c, q in enumerate(cols):
                value = self.matrix[p, q]
                if value > 0.0:
                    out.append('path b;\n picture l;\n')
                    out.append('b = fullcircle scaled %fcm shifted (%fcm,-%fcm);\n'
                               % (.9 * value, c, r))
                    out.append('l = thelabel(btex $%3.2f$ etex, (%fcm, -%fcm));\n'
                               % (value, c, r))
                    out.append('fill bbox l withcolor white;\n')
                    out.append('fill b withcolor .7white;\n')
                    out.append('draw l;\n')
                    # out.append('draw b;\n')
        out.append('endfig;\n')
        out.append('end;\n')
        return ''.join(out)

    def asMP(self):
        """Return MetaPost source plotting every item against every item."""
        return self._mp_figure(self.items, self.items)


class SimThes(IRstruct):
    """Variant with frequency-times-iif weights and a cosine matrix."""

    def iif(self, feat):
        """Return 1 / log(itemfreq(feat) + 1), or 0.0 if no item matches.

        The guard avoids dividing by log(1) == 0 for an unused feature.
        """
        freq = self.itemfreq(feat)
        if not freq:
            return 0.0
        return 1.0 / math.log(freq + 1)

    def weightp(self, item, feat):
        """Return featfreq(feat, item) * iif(feat)."""
        return self.featfreq(feat, item) * self.iif(feat)

    # Disabled query-expansion sketch (kept for reference):
    ## def simqt(self, query, t):
    ##     simqt = 0;
    ##     for s, weight in query:
    ##         simqt += self.matrix[s, t] * weight
    ##     return simqt;
    ## def expand(self, query):
    ##     norm = 0
    ##     result = []
    ##     for item, weight in query:
    ##         norm += weight;
    ##     for term in self.items:
    ##         simqt = self.simqt(query, term);
    ##         if simqt > 0:
    ##             result.append((Term, simqt/weight))
    ##     return result

    def simMatrix(self):
        """Compute, store in self.matrix and return the cosine matrix.

        For every feature, the items reached through its tokens (restricted
        to self.items) contribute weightp products to the pair sums and
        squared weightp to the per-item norms.  Also sets ``self.coeffs``
        (item -> squared norm) and ``self.sim`` (pair -> raw dot product).

        NOTE(review): assigning ``self.sim`` shadows the inherited ``sim()``
        method on this instance; kept because callers may read the dict.
        """
        # Items only define <= / >= (subsumption), so "upper triangle" is
        # decided by position in self.items rather than by sorting or '<',
        # which raise TypeError on Python 3.
        rank = {}
        for pos, item in enumerate(self.items):
            rank.setdefault(item, pos)

        self.coeffs = dict([(x, 0) for x in self.items])
        self.sim = dict([((p, q), 0) for q in self.items for p in self.items])
        self.matrix = dict([((p, q), 0) for q in self.items for p in self.items])

        for feat in self.feats:
            # Each item counts once per feature: weightp already includes the
            # token frequency, so repeating a duplicated item would inflate
            # the sums (self-similarity above 1).
            seen = set()
            members = []
            for token in feat.tokens:
                cand = token.item
                if cand is None or cand not in rank or cand in seen:
                    continue
                seen.add(cand)
                members.append(cand)
            weights = dict([(m, self.weightp(m, feat)) for m in members])
            for p in members:
                wp = weights[p]
                self.coeffs[p] += wp ** 2
                for q in members:
                    if rank[p] <= rank[q]:
                        self.sim[p, q] += wp * weights[q]

        for p in self.items:
            for q in self.items:
                if rank[p] <= rank[q] and self.sim[p, q] > 0:
                    self.matrix[p, q] = self.matrix[q, p] = (
                        self.sim[p, q] / math.sqrt(self.coeffs[p] * self.coeffs[q]))
        return self.matrix


class SimThes_CL(SimThes):
    """Cross-language variant: 'de' items as rows, 'en' items as columns."""

    def asMP(self):
        """Return MetaPost source plotting 'de' items against 'en' items.

        Every item's content must carry a 'lang' key (KeyError otherwise).
        """
        rows = [p for p in self.items if p.content['lang'] == 'de']
        cols = [q for q in self.items if q.content['lang'] == 'en']
        return self._mp_figure(rows, cols)


class TextRetriev(IRstruct):
    pass