#! /usr/local/bin/python -- """ pylint.py Goal: Python Type-Inference style type-verification using the public Abstract Syntax Tree (AST) interface. This is a static-checker, meaning it checks code without running it at all. Authors: David Jeske (jeske@egroups.net) Scott Hassan (hassan@dotfunk.com) Date: 1999-11-21 Current State: Parts of Stage 1 and Stage 2 are both working. Simple test files can be parsed and their type-inference graphs can be printed. Documentation is spread throughout the file, and is grouped under the following four headings: 1. Theory of Operation 2. Implementation Overview 3. Type Classes 4. Parsing Object ----------[ 1. Theory of Operation ]--------------------------- ** Vocabulary: Type: Number, String, Float, or a particular type of object TypeSite: a location where a type can occur, such as a local or global variable, function argument, return value, etc. ** Stage 1, Parsing: In the parse stage, we must be able to walk an AST for a full-python program and build a type-inference graph. This graph should record occurances of types at different type-sites. ** Stage 2, list all types at all type sites: After parsing the program and building the inference graph, we should be able to dump out a list of all the types which are known to appear at a given type site. For example, given a module: def a(an_arg): print an_arg a(1) # pass in a number a("name") # pass in a string a(open("foo","w")) # call the builtin open and pass in a File object We should be able to print out a type signature for the method "a" similar to: Any <- a(int) Any <- a(string) Any <- a(Obj<__builtin__.File>) This will have two main uses: 1. It can be diffed against a previous listing to check changes in type signatures 2. it can be loaded into an editor in a ctags like manner to allow programmers to ask the editor for the "possible types" of an argument or variable, and thus "look up" the implementation of that type. ** Stage 3, Inference Type-Check: It is theorized that after the parse stage is done, the information collected can then be re-analyzed to determine both type violations (i.e. TTypes with invalid signatures appearing at a TSite such as variables, or function arguments), and type-risks (i.e. extremely-polymorphic use of objects at a TSite) In addition, a syntax for describing static protocol-conformance or type-conformance can easily be developed which this tool will understand and check. For example, this directive: def a(an_arg): "PYLINT: None <- (int)" print an_arg Might be used to explain that the only valid type of "an_arg" is "int" and that the function should always return None. This could incrementally be converted to be similar to one of the proposed static type check syntax for python. For example: def a:None(an_arg:int) print an_arg Merely by allowing these type declaration terms to be ignored by the existing Python interpreter. """ ##--------[ 2. Implementation Overview ]--------------------------- ## ## There is much more detailed information below. This is ## the high-level implementation overview. ## ## ParseClass is the workhorse. You hand it an AST ## for a codeblock, and it will walk the AST and build ## a type-inference graph out of what it sees. ## ## There are there base-types of items which make it ## it into the type-inference graph. TScope,TType, and TSite. ## A TSite is a site for receiver types and thus picks up several ## different TTypes. A TScope is a namespace for TSites. ## ## For example. If you wanted to static-populate the ## type signature for the builtin "int(a_val)" function, ## you would need to: ## ## 1. have a TScope to put the name "int" in (perhaps __builtin__) ## 2. create a TSite for the "int" slot in the scope ## 3. put a TFunc in the TSite (i.e. because the site is always a function) ## 4. give the TFunc a return signature (always an integer TNumber) ## 5. give the TFunc a MethodSignature consisting of a single argument ## site (another TSite) ## 6. in the first argument TSite, put the valid argument types. ## In this case, TNumber(int), TNumber(float), and TString, because ## these are the valid arguments to the builtin "int(a_val)" ## ## You can see how this is done in the __init__ method for ## ParseClass. Make a note that because this is a builtin, the ## MethodSignature is "frozen". This means that when people ## call the method, they are checked against it's existing ## signature, instead of just contributing to it. ## import os, sys, string, time, getopt, types import symbol, token ###----[ some utility functions ]-------------------------- ############### # sliceInN(a_list,N) # # this slices a list into another list of N wide pieces. # # for example: # sliceInN([1,2,3,4],2) -> [[1,2],[3,4]] # def sliceInN(a_list,N): new_list = [] while a_list: new_list.append(a_list[:N]) a_list = a_list[N:] return new_list ##---[ basic syntax tree printing ]--------------------- ## ## This is used just to print out the raw syntax tree. ## def symbolprint(name,level=0): pass print "S",level * " ", symbol.sym_name[name] def tokenprint(name, value, level=0): print "T",(level*" "), print token.tok_name[name], str(value) def mypprint(obj, level=0): next_is = None last_token = None for o in obj: try: if last_token == None: last_token = o if symbol.sym_name.has_key(o): symbolprint(o,level) elif symbol.sym_name.has_key(last_token): mypprint(o,level + 1) elif token.tok_name.has_key(last_token): tokenprint(last_token,o,level+1) last_token = None except TypeError: print (level*" "), print o def mipprint(mi, indent=0): print " " * indent, try: classes = mi._class_info.items() for name, o in classes: print "class", name mipprint(o, indent+1) except: pass try: functions = mi._function_info.items() for o in functions: print "func", name mipprint(o, indent+1) except: pass ##----------------------------------------------------- ##-----[ 3. Type Classes ]-------------------------------- ## ## Here starts the real stuff. It's still in flux ## and messy, but here is the general idea: ## ##### Specific Type Objects: ## ## TScope : a variable scope, like modules, classs, instances, ## functions, etc. Keep in mind that because all datatypes ## in Python are mostly objects, all datatypes in here ## are descendent from TScope. ## ## TClass, TModule, TInstance : are all directly descended from ## TScope. They represent different levels of scoping. ## They "are" their own type information. Although ## type use is stored within other objects inside the ## scope. TClasses (like TFuncs) can handle apply_trailer() ## (i.e. function invocation). For a TClasses this causes ## instantiation (returns a TInstance). ## ## TType: the concepts of type live here. I believe TType and ## TScope are going to converge into one object, with ## every specific type descending from there. However ## they started out separate and are unfortunatly still ## separate. ## A TType understands how to compare itself with another ## TType and figure out if they are the same type. It ## also holds the base implementation for applyBinaryOp(). ## ## TString, TNumber, TInvalid, TName, TNone : are all descended ## from TType. They represent basic types. TInvalid is ## propagated as a result of an Invalid type operation. ## TNumber understands the different number types and ## "upgrading". ## ## ## TFunc: A TFunc has a return type, which is derived from parsing ## the codeblock for the function. The return type is ## provided when a block of ## code causes an apply_trailer() (i.e. function activation) ## on the TFunc. It also has a MethodSignature. ## The MethodSignature can either be provide ahead of time ## and frozen (i.e. for builtin methods, or known methods), ## or it will be populated each time someone calls the ## Function (or method). ## ## MethodSignature: This object is far from completed. It only ## correctly handles non-named arguments right now. The ## idea is to have a set of ordered (and eventually named) ## TSites which are receivers for TTypes. There are a few ## ways the valid TTypes for an argument site can be ## derived (this is some practice and some theory): ## ## 1. You can predefine the type signature of the method ## and "freeze" it, as I've done for some builtin functions, ## and should be done for all known/builtin functions. ## 2. You can examine the types present at a call site. For ## example. If I call "foo(1,2)" I've declared that the ## method signature for foo() should include "int,int". ## Likewise, if I then call "foo("test",2)", I've ## declared that the method signature should include ## "string,int". Right now the implementation of ## MethodSignature merges these two into a unified ## signature: [ ["int","string"], "int" ], representing ## the possible types at each site. ## ## In hindsight, it ## should probably do work to keep separate signatures ## separate when possible, because the following ## two alternate signatures: [ "int", "string" ], and ## [ "string", "int" ] should NOT be merged as ## [ ["int", "string"], ["int", "string"]] as this has ## now implied both ["int", "int"] and ["string","string"], ## neither of which were used. ## ## 3. By examining use of the arguments inside a function ## codeblock, it _may_ be possible to determine the ## required type. For example, given the function: ## ## def fn(a,b): ## b.print(a + 2) ## ## We know that the "a" arg site MUST be a TNumber, ## and the "b" arg site MUST be capable of responding ## to a "TAny<-print(TNumber)" method. ## ##-------------[ TScope and friends ]----------- class TScope: # type sites def __init__(self): self.scope_data = {} def report(self, depth = 0,prefix=""): print prefix, self depth = depth + 1 for k,v in self.scope_data.items(): # print depth * " ", k,v try: v.report(depth = depth + 1,prefix= (depth * " " + k)) except AttributeError: pass def bindSite(self,site): if site.myscope and site.myscope != self: site.bindMe() else: print "BIND: %s.%s/%s" % (self,site.name,site) self.scope_data[site.name] = site return self def findSite(self,tname): if self.scope_data.has_key(tname.actual): return self.scope_data[tname.actual] else: newsite = TSite(tname.actual,scope=self) return newsite #----[ subclasses of TScope ]---- class TClass(TScope): def __init__(self,name = ""): TScope.__init__(self) self.name = name def apply_trailer(self, trailer): print "TClass:apply_trailer", trailer return TInstance(self) def __repr__(self): return "TClass<%s, %s>" % (self.name, self.scope_data.keys()) class TInstance(TScope): def __init__(self,classobj): TScope.__init__(self) self.classobj = classobj def findSite(self, tname): if self.scope_data.has_key(tname.actual): return TScope.findSite(self,tname) else: return self.classobj.findSite(tname) def __repr__(self): return "TInstance of %s" % self.classobj class TModule(TScope): def __init__(self,name = ""): TScope.__init__(self) self.name = name def __repr__(self): return "TModule: %s [%s]" % (self.name,self.scope_data.keys()) class TType(TScope): def __init__(self, actual=None): TScope.__init__(self) self.type = "TType" self.actual = actual def isSameTypeAs(self,anotherType): if self.__class__ == anotherType.__class__: return 1 else: return 0 def applyBinaryOp(self,op,arg): return self # def __repr__(self): # return "TType:%s<%s>" % (self.__class__,self.actual) ##----[ subclasses of TType ]---- class TString(TType): def __init__(self,actual): self.string = actual self.type = "TString" def findSite(self, name): return None class TInvalid(TType): def __init__(self): # put some line number info here or something pass class TName(TType): def __init__(self,actual): self.actual = actual class TNone(TType): def __init__(self,actual = None): self.actual = actual class TNumber(TType): def __init__(self,actual,c = None): self.actual = actual self.type = "TNumber" self.subtype = type(eval(actual)) if c is None: self.location = ("builtin") else: self.location = (c.module,c.line_no) self.typemap = { type(0) : 1, type(0.0) : 2, type(0L) : 3 } def applyBinaryOp(self,op,arg): if arg.__class__ != TNumber: return TInvalid() else: if self.typemap[self.subtype] > self.typemap[arg.subtype]: return self else: return arg def isSameTypeAs(self,anotherObj): if self.__class__ == anotherObj.__class__: if self.typemap[self.subtype] == anotherObj.typemap[anotherObj.subtype]: return 1 return 0 def __repr__(self): return "TNumber,%s<%s,%s>" % (self.location,self.subtype,self.actual) def findSite(self, name): return None class TFunc(TType): def __init__(self,rettype = None,methodsig = None): TScope.__init__(self) if rettype is None: self.rettype = TSite() else: self.rettype = rettype self.methodsig = methodsig def addArg(self,newArg): if self.methodsig is None: self.methodsig = MethodSignature() self.methodsig.addArgument(newArg) def addReturnType(self,newReturnType): self.rettype.addType(newReturnType) def setMethodSig(self,newMethodSig): self.methodsig = newMethodSig def apply_trailer(self, trailer): print "apply_trailer", trailer # check to see if the input type was correct! if self.methodsig.frozen: if not self.methodsig.doesSigMatch(trailer): return TInvalid() # should put in a line number! else: # extend the call type signature self.methodsig.mergeSignature(trailer) return self.rettype def __repr__(self): return "TFunc %s <-- <%s>" % (self.rettype,self.methodsig) class TSite(TType): def __init__(self,name = "",type_list = None,scope=None): TScope.__init__(self) self.name = name self.myscope = scope if type_list: self.value_list = type_list else: self.value_list = [] def report(self,depth=0,prefix=""): print prefix," ", self for type in self.value_list: type.report(depth = depth + 2) def bindMe(self): if not self.myscope is None: self.myscope.bindSite(self) self.myscope = None # nuke the circular reference def assignment(self,mytype): self.value_list = [mytype] def mergeType(self,newtype): for a_type in newtype.value_list: self.addType(a_type) def addType(self,newtype): # we should really make sure it does not already # exist in the typelist found_it = 0 for a_type in self.value_list: if a_type.isSameTypeAs(newtype): found_it = 1 if found_it is 0: self.value_list.append(newtype) else: print "I've already seen this type!! ", newtype def dereference(self): if len(self.value_list) > 1: print "*** dereference of value list!" return self.value_list[0] def checkCompatibility(self,anotherSite): for a_type in anotherSite.value_list: for my_types in self.value_list: if my_types.isSameTypeAs(a_type): return 1 # receiver types did not match! print "-[receiver type mismatch]---------------------------------------" print "Expected : " , self.value_list print "received : " , anotherSite.value_list return 0 def __repr__(self): return "TSite<%s:%s:%s>" % (self.name,self.value_list, self.scope_data.keys()) ############## ## MethodSignature() ## class MethodSignature: def __init__(self,arglist = None,frozen = 0): self.frozen = frozen self.sites = [] if type(arglist) == type([]): for an_arg in arglist: self.addArgument(an_arg) def addArgument(self, arg): self.sites.append(arg) def mergeSignature(self,anotherSig): for sitenum in range(len(self.sites)): self.sites[sitenum].mergeType(anotherSig.sites[sitenum]) def doesSigMatch(self,anotherSig): for sitenum in range(len(self.sites)): if not self.sites[sitenum].checkCompatibility(anotherSig.sites[sitenum]): return 0 return 1 def __repr__(self): return "MethodSignature<%s>" % str(self.sites) ##------------------------------------------------------------ ##------------------------------------------------------------ ##--------------[ 4. Parsing Object ]--------------------------- ########### ## Context ## ## This is just a container for passing args through the ## parse call-tree. class Context: def __init__(self, a_node, a_tree,module = "", line_no = None): self.sym_num = a_node self.sym_list = a_tree self.module = module self.line_no = line_no ########## ## ParseClass ## ## You point this class at an AST (Abstract Syntax Tree) ## and it does the work of walking the tree and assembling ## the type-inference graph. ## ## The method "parse_tree()" was the starting point and does ## the main dispatch to the other methods. For each AST node, ## there is a specific method to handle what occurs there. ## The methods are named after the node names, prepended with ## either "sym_" for symbol nodes, or "tok_" for token nodes. ## ## For example, this creates a number type from a NUMBER token: ## ## def tok_NUMBER(self,c): ## c.line_no = c.sym_list[1] ## return TNumber(c.sym_list[0],c) ## ## ...and this handles adding a return type to the current TScope ## from a return statement: ## ## def sym_return_stmt(self,c): ## if c.sym_list > 1: ## retval = self.parse_tree(c.sym_list[1],c) # "return " ## else: ## retval = TNone() # "return" ## ## self.stack[-1].addReturnType(retval) ## print "RETURN!!! " , retval ## ## return retval ## ## While the tree is parsed, types are both accumulated (for inference) ## and checked. If an invalid type coersion is performed, the resulting ## type will be TInvalid() and it will propagate through the inference ## graph. ## ## If a TSite with a frozen type signature is violated, a static-check ## error will occur. ## ## Otherwise, the parse propagates type-use around the code during the ## parse stage. It is theorized that after the parse stage is done, ## the information collected can then be re-analyzed to determine ## both type violations (i.e. TTypes with invalid signatures appearing ## at a TSite like a variable, or argument), and type-risks (i.e. ## extremely-polymorphic use of objects at a TSite) ## ## class ParseClass: def __init__(self,module_space = None): self.depth = 0 self.valuemode = "lvalue" if module_space is None: # setup builtin environment builtin_mod = TModule("__builtin__") self.module_space = builtin_mod self.globals = builtin_mod self.stack = [self.globals] tsite = TSite("None") tsite.assignment(TNone()) builtin_mod.bindSite(tsite) tsite = TSite("int") tsite.assignment(TFunc(TNumber("1"), MethodSignature(arglist=[TSite(type_list=[TNumber("1"), TNumber("1.1"), TNumber("1L"), TString("1")])], frozen = 1))) builtin_mod.bindSite(tsite) tsite = TSite("open") tsite.assignment(TFunc(TInstance(TClass("File")), MethodSignature(arglist=[TSite(type_list=[TString("1")]), TSite(type_list=[TString("1")]),], frozen = 1))) builtin_mod.bindSite(tsite) # add a new module def newModule(self,a_module,name): # bind it to a site in the module space! tsite = TSite(name) tsite.assignment(a_module) self.module_space.bindSite(tsite) def parseModule(self,a_tree, a_name): this_module = TModule(a_name) self.newModule(this_module,a_name) self.stack.append(this_module) self.parse_tree(a_tree,Context(None,None,0)) self.stack.pop() def parse_tree(self, a_tree, c): a_node = a_tree[0] if symbol.sym_name.has_key(a_node): # yes, it's a symbol! try: a = getattr(self,"sym_%s" % symbol.sym_name[a_node]) except AttributeError: a = getattr(self,"sym_DEFAULT") elif token.tok_name.has_key(a_node): # yes, it's a token! try: a = getattr(self,"tok_%s" % token.tok_name[a_node]) except AttributeError: a = getattr(self,"tok_DEFAULT") c = Context(a_node, a_tree[1:], module = "__main__") self.depth = self.depth + 1 val = a(c) self.depth = self.depth - 1 return val def tok_NUMBER(self,c): c.line_no = c.sym_list[1] return TNumber(c.sym_list[0],c) def tok_STRING(self,c): return TString(c.sym_list[0]) def tok_NAME(self,c): return TName(c.sym_list[0]) def sym_return_stmt(self,c): if c.sym_list > 1: retval = self.parse_tree(c.sym_list[1],c) # "return " else: retval = TNone() # "return" self.stack[-1].addReturnType(retval) print "RETURN!!! " , retval return retval def sym_atom(self,c): curterm = self.parse_tree(c.sym_list[0],c) if curterm.__class__ == TName: curterm = self.stack[-1].findSite(curterm) if self.valuemode == "rvalue": curterm = curterm.dereference() else: #raise "ParseCrap", "sym_atom: " + str(curterm) pass return curterm def sym_power(self,c): curterm = self.parse_tree(c.sym_list[0],c) print "sym_power", curterm, c.sym_list[0] term_list = c.sym_list[1:] for trailer in term_list: ptree = self.parse_tree(trailer,c) if ptree.__class__ == MethodSignature: if curterm.__class__ == TSite: curterm = curterm.dereference() curterm = curterm.apply_trailer(ptree) elif ptree.__class__ == TName: if curterm.__class__ == TSite: curterm = curterm.dereference() curterm = curterm.findSite(ptree) return curterm def sym_trailer(self, c): op = self.parse_tree(c.sym_list[0],c) print "sym_trailer", op if op == "(": args = c.sym_list[1:-1] if args: curterm = self.parse_tree(c.sym_list[1],c) print "sym_trailer", curterm else: curterm = MethodSignature() elif op == ".": curterm = self.parse_tree(c.sym_list[1],c) elif op == "[": curterm = self.parse_tree(c.sym_list[1],c) dummy = self.parse_tree(c.sym_list[1],c) return curterm def sym_arglist(self, c): signature = MethodSignature() curterm = self.parse_tree(c.sym_list[0],c) signature.addArgument(curterm) term_list = c.sym_list[1:] for op,term in sliceInN(term_list,2): curterm = self.parse_tree(term,c) signature.addArgument(curterm) return signature def sym_argument(self, c): last_term = c.sym_list[-1] front_terms = c.sym_list[:-1] last_val = self.parse_tree(last_term,c) if front_terms: name = self.parse_tree(front_terms[0],c) a = TSite(name=name) else: a = TSite() a.assignment(last_val) return a def sym_expr_stmt(self,c): last_term = c.sym_list[-1] front_terms = c.sym_list[:-1] self.valuemode = "rvalue" last_val = self.parse_tree(last_term,c) self.valuemode = "lvalue" for site_node,op in sliceInN(front_terms,2): site = self.parse_tree(site_node,c) print "SITE: %s/%s" % (site,site.myscope) self.stack[-1].bindSite(site) site.assignment(last_val) return last_term def sym_arith_expr(self,c): curterm = self.parse_tree(c.sym_list[0],c) term_list = c.sym_list[1:] for op,term in sliceInN(term_list,2): curterm = curterm.applyBinaryOp(self.parse_tree(op,c), self.parse_tree(term,c)) print curterm return curterm ### sym_term is the same sym_term = sym_arith_expr def sym_parameters(self,c): return self.parse_tree(c.sym_list[1],c) def sym_varargslist(self,c): arg1 = self.parse_tree(c.sym_list[0],c) argsite = TSite(arg1.actual) self.stack[-1].addArg(argsite) return argsite def sym_funcdef(self,c): funcname = self.parse_tree(c.sym_list[1],c) print self.depth * " ", symbol.sym_name[c.sym_num] , "FUNCDEF!!!", funcname newsite = TSite(funcname.actual) # create site funcscope = TFunc() self.stack.append(funcscope) for a_elem in c.sym_list: self.parse_tree(a_elem, c) self.stack.pop() # funcscope.setMethodSig( MethodSignature(arglist=[TSite(type_list=[TNumber("1")])]) ) newsite.assignment(funcscope) self.stack[-1].bindSite(newsite) # bind it return funcscope def sym_classdef(self,c): classname = self.parse_tree(c.sym_list[1],c) newsite = TSite(classname.actual) # create site classscope = TClass(classname.actual) newsite.assignment(classscope) self.stack[-1].bindSite(newsite) # bind it self.stack.append(classscope) for a_elem in c.sym_list: self.parse_tree(a_elem, c) self.stack.pop() return classscope def sym_DEFAULT(self,c): print self.depth * " ", symbol.sym_name[c.sym_num] if len(c.sym_list) == 1: return self.parse_tree(c.sym_list[0], c) else: for a_elem in c.sym_list: self.parse_tree(a_elem, c) def tok_DOT(self,c): return "." def tok_LPAR(self,c): return "(" def tok_DEFAULT(self,c): print self.depth * " " , token.tok_name[c.sym_num], c.sym_list return TType(actual=c.sym_list) def report(self,depth = 0): self.globals.report() # for k,v in self.globals.items(): # print depth * " ", k,v.report(depth + 1) #---------------------------------------------------------- def usage(): print "pylint.py " def main(argv, stdout, environ): import parser if len(argv) != 2: usage() else: # fn = "a.py" fn = argv[1] print "*** Parsing %s" % fn ast = parser.suite(open(fn).read()) l = ast.tolist(1) if 0: print dir(ast) print ast import pprint pprint.pprint(l) print mypprint(l) if 1: pc = ParseClass() pc.parseModule(l, "__main__") print "------------------------" pc.report() if 0: import example mi = example.ModuleInfo(l, fn) mipprint(mi) if __name__ == "__main__": main(sys.argv, sys.stdout, os.environ)