""" src2doc.py -- Takes a source file and makes a document file. Default the source is expected to be Python, and the generated document will be {\LaTeX}. Unless other notices are present in any part of this file explicitly claiming copyrights for other people and/or organizations, the contents of this file is fully copyright (C) 1997 Norut IT, all rights reserved. Permission is granted to make and distribute verbatim copies of this document provided that the copyright notice and this permission notice are preserved on all copies. Permission is granted to copy and distribute modified versions of this document under the conditions for verbatim copying, provided that the entire resulting derived work is distributed under the terms of a permission notice identical to this one. If any other present copyright notices interfere with the copyright claims above, these claims may be partially overruled by those notices. """ rcsFile = "$RCSfile: src2doc.py,v $" rcsDate = "$Date: 2001-08-09 12:33:27 $" rcsRev = "$Revision: 1.2 $" rcsState = "$State: Exp $" rcsAuthor = "$Author: aa $" rcsLog = """ $Log: src2doc.py,v $ Revision 1.2 2001-08-09 12:33:27 aa Tupple stuff (changes in Python between 1.5.2 and 2.0.1?) Revision 1.1 2000/07/25 13:42:14 aa Moved to notebook (aa) Revision 0.4 1997/11/29 18:45:56 anders Added support for raw string format (r|R) in Python 1.5. Revision 0.3 1997/10/17 22:10:13 anders Changed the doc string cleaned up the layout for the Src and Doc class. Revision 0.2 1997/07/10 13:35:45 anders Made all values in option dictionaries strings. Revision 0.1 1997/05/26 00:01:37 anders Initial working version based on src2ltx v0.1. """ """ We are importing some library modules. We need \texttt{regex} because we are using regular expression to describe source tokens. The \texttt{string} module gives us functions for string manipulation. We need the stdio stuff from \texttt{sys} and we use \texttt{types} to know the types of some objects. """ import regex # Use the new "re" next time import string import sys from types import * """ An exception used when we don't find a matching component in the source. """ NoSymbolMatch = "NoSymbolMatch" class Src: """ The Src class is the specification of the source. It must contain two lists of two-tupples; \texttt{srctok}, which specifies the tokens to be recognized, and \texttt{srccomp}, which specifies the language components to be recognized (language components are made from tokens in srctok). It also contains an option dictionary. The \texttt{ws} token must be defined in \texttt{srctok}. We also have six tokens that are not defined in \texttt{srctok}; \texttt{bod} (beginning of document), \texttt{eod} (end of document), \texttt{bof} (beginning of file), \texttt{eof} (end of file), \texttt{bol} (beginning of line) and \texttt{eol} (end of line). These are automatically inserted in the input stream. """ """ The option dictionary for Src (with default values). """ option = { "startline": "1", # Usual starts at first line "stopline": "-1", # and stops at last line (-1). "tabsize": "8", # Deafault tabular size is 8. "currcomp": ""} """ Used in the token definitions. """ whitespaces = "[\011\013\014 ]+" symbolstr = "[-:=;\.<>\+\*/!%,|&^~]" symbols = "(\|)\|\[\|\]\|'\|\"\|`\|{\|}\|#" name = "[a-zA-Z_][a-zA-Z0-9_]*" escape = "\\\\\(" + symbols + "\|" + symbolstr + "\|[a-zA-Z0-9]+\)" numbers = "0x[" + string.hexdigits + "]+" numbers = numbers + "\|[0-9]*\.?[0-9]+L?\(e-?[0-9]+\)?" keywords = "access\|and\|break\|class\|continue\|def\|del\|elif" keywords = keywords + "\|else\|except\|exec\|finally\|for\|from" keywords = keywords + "\|global\|if\|import\|in\|is\|lambda\|not" keywords = keywords + "\|or\|pass\|print\|raise\|return\|try\|while" """ Tokens recognized in the source language. """ srctok = [ ("esc", (escape , [])), ("name", (name, [ ("kw", (keywords, [ ("def", ("def", [])), ("cls", ("class", [])), ("from", ("from", [])), ("import", ("import", []))])), ("r", ("r\|R", []))])), ("num", (numbers, [])), ("ws", (whitespaces, [])), ("symstr", (symbolstr, [ ("period", ("\.", [])), ("comma", (",", [])), ("asterix", ("\*", []))])), ("sym", (symbols, [ ("quote", ("'", [])), ("dquote", ('"', [])), ("bquote", ("`", [])), ("comchar", ("#", []))]))] """ Source components recognized. We are using list and not dictionary because the order is important (we search for components in this order). """ srccomp = [ ("docstring", [ ("#", "bol"), ("?", "ws"), ("%", [ ("?", "r"), ("", "quote"), ("", "quote"), ("", "quote")]), ("*%", [ ("^", [ ("", "quote"), ("", "quote"), ("", "quote")])]), ("%", [ ("", "quote"), ("", "quote"), ("", "quote")]), ("%", [ ("?", "ws"), ("", "eol")])]), ("docstring", [ ("#", "bol"), ("?", "ws"), ("%", [ ("?", "r"), ("", "dquote"), ("", "dquote"), ("", "dquote")]), ("*%", [ ("^", [ ("", "dquote"), ("", "dquote"), ("", "dquote")])]), ("%", [ ("", "dquote"), ("", "dquote"), ("", "dquote")]), ("%", [ ("*", "ws"), ("", "eol")])]), ("quotepar", [ ("", [ ("?", "r"), ("", "quote"), ("", "quote"), ("", "quote")]), ("*", [ ("^", [ ("", "quote"), ("", "quote"), ("", "quote")])]), ("", [ ("", "quote"), ("", "quote"), ("", "quote")])]), ("quotepar", [ ("", [ ("?", "r"), ("", "dquote"), ("", "dquote"), ("", "dquote")]), ("*", [ ("^", [ ("", "dquote"), ("", "dquote"), ("", "dquote")])]), ("", [ ("", "dquote"), ("", "dquote"), ("", "dquote")])]), ("quotepar", [ ("", [ ("?", "r"), ("", "bquote"), ("", "bquote"), ("", "bquote")]), ("*", [ ("^", [ ("", "bquote"), ("", "bquote"), ("", "bquote")])]), ("", [ ("", "bquote"), ("", "bquote"), ("", "bquote")])]), ("quotestring", [ ("?", "r"), ("", "quote"), ("*", [ ("^", "quote")]), ("", "quote")]), ("quotestring", [ ("?", "r"), ("", "dquote"), ("*", [ ("^", "dquote")]), ("", "dquote")]), ("quotestring", [ ("?", "r"), ("", "bquote"), ("*", [ ("^", "bquote")]), ("", "bquote")]), ("bol", [ ("", "bol")]), ("eol", [ ("", "eol")]), ("keyword", [ ("", "kw")]), ("comment", [ ("", "comchar"), ("*", [ ("^", "eol")])]), ("function", [ ("", "def"), ("", "ws"), ("", "name")]), ("class", [ ("", "cls"), ("", "ws"), ("", "name")]), ("importstm", [ ("", "import"), ("", "ws"), ("", [ ("", "name"), ("*", [ ("", "period"), ("", "name")]), ("*", [ ("?", "ws"), ("", "comma"), ("?", "ws"), ("", [ ("", "name"), ("*", [ ("", "period"), ("", "name")])])])])]), ("fromstm", [ ("", "from"), ("", "ws"), ("", [ ("", "name"), ("*", [ ("", "period"), ("", "name")])]), ("", "ws"), ("", "import"), ("", "ws"), ("|", [ ("", "asterix"), ("", [ ("", "name"), ("*", [ ("?", "ws"), ("", "comma"), ("?", "ws"), ("", "name")])])])])] class Doc: """ The Doc class contains information about how to generate the document format. It must contain three different dictionaries; \texttt{charmap} to map illegal document characters to strings, \texttt{tokmap} to map tokens to document strings, and \texttt{compmap} to map language components to document text. There is also an option dictionary, used for example in the mapping of tokens (see \texttt{mapTok()}). """ """ The option dictionary for Doc (with default values). """ option = { "document": "1", # Generate document by default "cls": "aasrc", # using the "aasrc" class "clsopt": "", # with no options "preamble": "", # and no preamble "title": ""} # using the default title. """ The character mapping to avoid illegal document (LaTeX) characters. """ charmap = { "%": "\\%", "$": "\\$", "&": "\\&", "#": "\\#", "_": "{\\aaus}", "{": "{\\aalbrace}", "}": "{\\aarbrace}", "\\": "{\\aabs}", "^": "{\\aahat}", "~": "{\\aatilde}", "|": "{\\aabar}", "<": "{\\aalt}", ">": "{\\aagt}", "*": "{\\aast}"} """ The mapping from source tokens to document dependent tokens. The substrings (except \texttt{token}) is fetched from the option dictionary in this class. The first element in the tupple is the new name of the token (after the map). """ tokmap = { "bod": ("", ("\\documentclass%(clsopt)s{%(cls)s}\n%(preamble)s" + "\\begin{document}\n\\title{%(title)s}\n%(token)s")), "eod": ("", "%(token)s\\end{document}\n"), "bof": ("", "\\begin{aasrc}\n%(token)s"), "eof": ("", "%(token)s\\end{aasrc}\n"), "bol": ("bol", "%(token)s"), "eol": ("eol", "%(token)s"), "ws": ("ws", "\\aaws{%(token)s}")} """ The mapping from language to document components. Be aware that there should be a match between the number of substrings (%s) in the document format presented here and the the number of tuppels in the description list for the coresponding source language component (\texttt{Src.srccomp}). """ compmap = { "docstring": "\\aadoc{%s}{%s}{%s}%s\\endaadoc{%s}%s", "quotepar": "\\aaqt{%s%s%s}", "quotestring": "\\aaqt{%s%s%s%s}", "bol": "\\aaline{%s}{", "eol": "}%s", "keyword": "\\aakw{%s}", "comment": "%s\\aacom{%s}", "function": "\\aakw{%s}%s\\aafunc{%s}", "class": "\\aakw{%s}%s\\aacls{%s}", "importstm": "\\aakw{%s}%s\\aamod{%s}", "fromstm": "\\aakw{%s}%s\\aamod{%s}%s\\aakw{%s}%s\\aanm{%s}"} class SrcTok: """ Generate tokens from the source. """ def __init__(self, input=None, src=None, doc=None): """ Initialize the SrcTok class. """ if input: self.input = input else: self.input = sys.stdin if src: self.src = src else: self.src = Src() if doc: self.doc = doc else: self.doc = Doc() self.compiledtokens = self.compileSrcTokens(self.src.srctok) self.compiledall = regex.compile(string.join(map( lambda tokenit: tokenit[1][0], self.src.srctok), "\|")) self.linenum = 1 while self.linenum < string.atoi(self.src.option["startline"]): if not self.input.readline(): break self.linenum = self.linenum + 1 def compileSrcTokens(self, srctok): """ Make a compiled srctokens tree. """ compiled = [] for (name, spec) in srctok: compiled.append((name, regex.compile(spec[0]), self.compileSrcTokens(spec[1]))) return compiled def searchToken(self, line, pos): """ Search for next token. """ return self.compiledall.search(line, pos) def bestToken(self, line, pos, compiled=[]): """ Find the best token match (most specialized) at this position. """ if not compiled: compiled = self.compiledtokens for token in compiled: length = token[1].match(line, pos) if length > 0: if token[2]: (nlength, nname) = self.bestToken(line, pos, token[2]) if nlength == length: return (nlength, nname) return (length, token[0]) return (0, "") def decompLine(self, tokens, line): """ Decompose a line to a list of tokens. """ tabsize = string.atoi(self.src.option["tabsize"]) linepos = index = 0 pos = self.searchToken(line, index) while pos != -1: (length, name) = self.bestToken(line, pos) if pos > index: tokens.append(("", line[index:pos])) if name == "ws": num = 0 for wsi in range(pos, pos + length): if line[wsi] == "\t": num = num + (tabsize - ((linepos + num) % tabsize)) else: num = num + 1 linepos = linepos + num token = `num` else: linepos = linepos + length token = line[pos:pos+length] tokens.append((name, token)) index = pos + length pos = self.searchToken(line, index) if index < len(line): tokens.append(("", line[index:len(line)])) def fetchTokens(self, tokens): """ Fetch a new line from the source and decompose it to tokens. Returns false if there ain't no line to fetch. """ if (string.atoi(self.src.option["stopline"]) == -1 or self.linenum <= string.atoi(self.src.option["stopline"])): line = self.input.readline() debug.write("%s." % `self.linenum`) self.linenum = self.linenum + 1 else: line = "" if not line: return 0 tokens.append(("bol", `self.linenum - 1`)) self.decompLine(tokens, line[:-1]) tokens.append(("eol", "\n")) return 1 class Tok: """ A class to manage tokens (with possible check points). """ def __init__(self, srctok=None, pre=[], post=[]): """ Initialize the Tok class. """ if srctok: self.srctok = srctok else: self.srctok = SrcTok() self.src = self.srctok.src self.doc = self.srctok.doc self.tokenpos = -1 self.nextlist = [] if pre: self.tokens = pre else: if self.doc.option["document"] != "0": self.tokens = [("bod", "")] debug.write("Generating document\n") else: debug.write("Generating environment\n") self.tokens = [("bof", "")] if post: self.posttok = post else: if self.doc.option["document"] != "0": self.posttok = [("eod", "")] else: self.posttok = [("eof", "")] def checkPoint(self): """ Make a check point. We must save the position off current token so we can rollback. """ self.nextlist.append(self.tokenpos) def commit(self): """ Ok, we committed the sequence of tokens from last check point. """ del self.nextlist[-1] def rollBack(self): """ Don't commit the token sequence. Rollback to the last check point. """ self.tokenpos = self.nextlist[-1] del self.nextlist[-1] def next(self): """ Fetch the next token. We may have to fetch a new line from the source file (with fetchTokens). """ if self.tokenpos + 1 < len(self.tokens): self.tokenpos = self.tokenpos + 1 else: if self.nextlist: self.tokenpos = self.tokenpos + 1 else: self.tokens = [] self.tokenpos = 0 if not self.srctok.fetchTokens(self.tokens): if self.posttok: self.tokens = self.tokens + self.posttok self.posttok = [] else: raise IndexError def current(self): """ Returns the current token. """ return self.tokens[self.tokenpos] class TokComp: """ Find components. """ def __init__(self, tok=None): """ Initialize the TokComp class. """ if tok: self.tok = tok else: self.tok = Tok() self.src = self.tok.src self.doc = self.tok.doc def mapChars(self, text, start=0, stop=0): """ Using charmap to map illegal document cahracters to commands. """ if stop == 0: stop = len(text) ttext = "" for index in range(start, stop): try: ttext = ttext + self.doc.charmap[text[index]] except KeyError: ttext = ttext + text[index] return ttext def mapTok(self, mod): """ Using tokmap to map language tokens to document tokens. """ (name, text) = self.tok.current() currcomp = self.src.option["currcomp"] if name == "bol" and not currcomp == "bol": try: tt = string.split(self.doc.compmap[currcomp], "%s")[0] except KeyError: tt = "" ttext = self.doc.compmap[name] % (text,) + tt if "%" in mod or "/" in mod: ttext = "" # Discard line number elif "#" in mod: ttext = text return ttext if name == "eol" and not currcomp == "eol": try: tt = string.split(self.doc.compmap[currcomp], "%s")[-1] except KeyError: tt = "" ttext = tt + self.doc.compmap[name] % (text,) if "%" in mod or "/" in mod: ttext = text return ttext if "#" in mod or "%" in mod: if name == "ws": text = " " * string.atoi(text) return text if name == "ws" and "/" in mod: text = " " * string.atoi(text) return text try: self.doc.option["token"] = self.mapChars(text) return self.doc.tokmap[name][1] % self.doc.option except KeyError: return self.mapChars(text) def matchComp(self, comp): """ Find one component matching tokens. The modifier specifies the type of the match (not, zero or more, single). Be aware that some modifiers are only interpreted by matcCompList. """ self.tok.checkPoint() self.tok.next() text = "" try: if "^" in comp[0]: if self.tok.current()[0] != comp[1]: text = self.mapTok(comp[0]) else: raise NoSymbolMatch elif "*" in comp[0]: self.tok.rollBack() # We can match * with zero tokens self.tok.checkPoint() try: while 1: self.tok.checkPoint() self.tok.next() if self.tok.current()[0] != comp[1]: self.tok.rollBack() break else: self.tok.commit() text = text + self.mapTok(comp[0]) except IndexError: pass else: if self.tok.current()[0] == comp[1]: text = self.mapTok(comp[0]) else: raise NoSymbolMatch except NoSymbolMatch: self.tok.rollBack() raise NoSymbolMatch else: self.tok.commit() return text def matchCompList(self, comp): """ Find a list component matching tokens. The modifier specifies the type of the match (not, zero or more, single, ...). """ self.tok.checkPoint() text = "" try: # ? and + are the same for both lists and strings if "?" in comp[0]: try: text = self.matchCompList((comp[0][1:], comp[1])) except NoSymbolMatch: pass elif "+" in comp[0]: try: text = self.matchCompList((comp[0][1:], comp[1])) except NoSymbolMatch: raise NoSymbolMatch else: try: ttext = self.matchCompList( ("*" + comp[0][1:], comp[1])) except NoSymbolMatch: pass else: text = text + ttext # Use matchComp if the component is not a list elif type(comp[1]) is StringType: try: text = self.matchComp(comp) except NoSymbolMatch: raise NoSymbolMatch # Ok, we know it is a list elif "|" in comp[0]: for (mod, ccomp) in comp[1]: try: text = self.matchCompList((mod + comp[0][1:], ccomp)) except NoSymbolMatch: continue else: break else: raise NoSymbolMatch elif "^" in comp[0]: self.tok.checkPoint() for (mod, ccomp) in comp[1]: try: ttext = self.matchCompList((mod + comp[0][1:], ccomp)) except NoSymbolMatch: self.tok.next() text = text + self.mapTok(comp[0]) self.tok.commit() break else: text = text + ttext else: self.tok.rollBack() raise NoSymbolMatch elif "*" in comp[0]: cont = 1 while cont: cont = 0; ttext = "" self.tok.checkPoint() for (mod, ccomp) in comp[1]: try: tttext = self.matchCompList( (mod + comp[0][1:], ccomp)) except NoSymbolMatch: self.tok.rollBack() break else: ttext = ttext + tttext else: self.tok.commit() text = text + ttext cont = 1 else: if comp[0]: mmod = comp[0] else: mmod = "" for (mod, ccomp) in comp[1]: try: ttext = self.matchCompList((mod + mmod, ccomp)) except NoSymbolMatch: raise NoSymbolMatch else: text = text + ttext except NoSymbolMatch: self.tok.rollBack() raise NoSymbolMatch else: self.tok.commit() return text class Comp: """ Generates components (a name and a texttupple) from tokens. """ def __init__(self, tokcomp=None): """ Initialize the TokComp class. """ if tokcomp: self.tokcomp = tokcomp else: self.tokcomp = TokComp() self.tok = self.tokcomp.tok self.src = self.tokcomp.src self.doc = self.tokcomp.doc self.ftokcomp = {} for index in range(len(self.src.srccomp)): tlist = self.src.srccomp[index][1] try: self.appendFtok(index, tlist) except KeyError: self.ftokcomp = {} break if self.ftokcomp: debug.write("Using speedup\n") def appendFtok(self, index, tlist): if tlist[0][0] != "": raise KeyError if type(tlist[0][1]) is StringType: try: self.ftokcomp[tlist[0][1]].append(index) except KeyError: self.ftokcomp[tlist[0][1]] = [index] else: self.appendFtok(index, tlist[0][1]) def fetchComp(self, comp): """ fetch a component by name. """ self.tok.checkPoint() texttupple = () try: for (mod, ccomp) in comp: try: text = self.tokcomp.matchCompList((mod, ccomp)) except NoSymbolMatch: self.tok.rollBack() raise NoSymbolMatch else: texttupple = texttupple + (text,) else: self.tok.commit() return texttupple except (KeyError, NoSymbolMatch): raise NoSymbolMatch def searchComp(self): """ Search for components from in src.srccomp. Use speedup if available (based on value of first token). """ comprange = [] if self.ftokcomp: self.tok.checkPoint() self.tok.next() try: comprange = self.ftokcomp[self.tok.current()[0]] except KeyError: pass self.tok.rollBack() if not comprange: comprange = range(len(self.src.srccomp)) for index in comprange: (name, comp) = self.src.srccomp[index] self.src.option["currcomp"] = name try: texttupple = self.fetchComp(comp) except NoSymbolMatch: continue else: return (name, texttupple) self.tok.next() self.src.option["currcomp"] = "" return ("", (self.tokcomp.mapTok(""),)) class CompDoc: """ Map characters, tokens and components to document format. """ def __init__(self, output=None, comp=None): """ Initialize the CompDoc class. """ if output: self.output = output else: self.output = sys.stdout if comp: self.comp = comp else: self.comp = Comp() self.doc = self.comp.doc def mapComp(self, name, texttupple): """ Using compmap to map language components to document components. """ try: return self.doc.compmap[name] % texttupple except KeyError: return string.joinfields(texttupple, "") def printDoc(self): debug.write("Starting\n") while 1: try: (name, texttupple) = self.comp.searchComp() sys.stdout.write(self.mapComp(name, texttupple)) except IndexError: break debug.write("\nDone\n") class Debug: """ Handy stuffs """ def __init__(self): """ Initialize the Debug class. """ self.write = sys.stderr.write # Make an instance of Debug debug = Debug() """ If this is the main file, we do it with default values. """ if __name__ == "__main__": CompDoc().printDoc()