tools/fileio.py was not tokenizing
[dwarf-doc.git] / dwarf5 / tools / fileio.py
index 9ec0b21..4eed82b 100644 (file)
@@ -1,6 +1,9 @@
+# Copyright 2012 DWARF Debugging Information Format Committee
 
 # All the little classes used in storing latex source data.
-# Copyright 2012 DWARF Debugging Information Format Committee
+# Reads in the tex source and builds internal lists of the
+# tokenized source.  The tokenization is adequate
+# for our purposes, but just barely adequate.
 
 import sys
 
@@ -37,13 +40,18 @@ def isIdNext(c):
   # : at the end of a DW* name on input.
   if c == ":":
     return "y"
+  # Do not allow \ in the middle of a name.
   if c == "\\":
-    return "y"
+    return "n"
   if c == "-":
     return "y"
   if c == "_":
     return "y"
   return "n"
+def isShift(c):
+  if ord(c) >= 128:
+    return "y"
+  return "n"
 def isIndivid(c):
   if c == "[":
     return "y"
@@ -59,38 +67,69 @@ def isIndivid(c):
     return "y"
   return "n"
 
+# self._tex        DW\-\_ATE and the like
+# self._underbar   DW\_ATE  and the like
+# self._std   the way a DW_ATE and the like looks in the standard
+# self._label With all _ and - removed.  Like DWATE
+
 class dwtoken:
+  """ Token types: 
+  id: identifier
+  ind: a character taken as an individual character.
+  none: No characters seen yet.
+  shift: A character with the high bit set (ord >= 128); not something we expect.
+  -      In DW4 these high-bit-chars are special 3-character left and right quotes.
+  -      charfix.py  can replace these with Latex ascii quotes.
+  other: Some other character, but ascii, seemingly.  """
   def __init__(self):
     self._tex = []
     self._underbar = []
     self._std = []
     self._label = []
-    # Class is "id", "ind","other","none"
+    # Class is "id", "ind","other","shift","none"
     self._class = "none"
-  def insertid(self,string):
+    self._linenum = 0
+  def insertid(self,string,line):
     self._class =  "id"
     self._tex = list(string)
     self._underbar = self._tex
     self._std = self._tex
     self._label = self._tex
-  def setIndivid(self,c):
+    self._linenum = line
+  def setIndivid(self,c,line):
     self._tex = [c]
     self._underbar = [c]
     self._std = [c]
     self._label = [c]
     self._class =  "ind"
-  def setInitialIdChar(self,c):
+    self._linenum = line
+  def setInitialIdChar(self,c,line):
     self._tex = [c]
     self._class =  "id"
+    self._linenum = line
   def setNextIdChar(self,c):
     self._tex += [c]
 
-  def setInitialOther(self,c):
+  def setInitialShift(self,c,line):
+    self._tex = [c]
+    self._underbar = [c]
+    self._std = [c]
+    self._label = [c]
+    self._class =  "shift"
+    self._linenum = line
+  def setNextShift(self,c):
+    self._tex += [c]
+    self._underbar += [c]
+    self._std += [c]
+    self._label += [c]
+    self._class =  "shift"
+  def setInitialOther(self,c,line):
     self._tex = [c]
     self._underbar = [c]
     self._std = [c]
     self._label = [c]
     self._class =  "other"
+    self._linenum = line
   def setNextOther(self,c):
     self._tex += [c]
     self._underbar += [c]
@@ -156,20 +195,29 @@ class dwtoken:
 class  dwline:
   """using an input line, create a list of tokens for the line.
      Legal class transitions in tokenize() are:
+     none->shift
      none->other
      none->id
      none->ind
+
      other->ind
      other->id
+     other->shift
+
+     shift->id
+     shift->ind
+     shift->other
+
      id->ind
      id->other
+     id->shift
   """
   def __init__(self):
     # list of dwtoken.
     self._toks = []
 
   
-  def tokenize(self,rec):
+  def tokenize(self,rec,filename,linenum):
     """using an input line, create a list of tokens for the line.
        Legal class transitions in tokenize() are:
        none->other
@@ -178,6 +226,7 @@ class  dwline:
        other->ind
        other->id
        id->ind
+       id->id  
        id->other
     """
     dwclass = "none"
@@ -186,6 +235,8 @@ class  dwline:
     global keepcomments
     for c in rec:
       charnum = charnum +1
+      if ord(c) >= 128:
+        print " Warning: encountered character ord:",ord(c), "at offset",charnum,"line",linenum,filename
       if keepcomments == "d" and c == "%" and ( charnum == 0 or rec[charnum - 1] != "\\" ):  
         # Not keeping comments. We drop % and following to end of line 
         # unless preceded by \ 
@@ -196,52 +247,102 @@ class  dwline:
           # would not be harmful.
           continue
       elif dwclass == "none" or dwclass == "ind":
+        if isShift(c) == "y":
+          combotok.setInitialShift(c,linenum)
+          dwclass = "shift"
+          continue
         if isIndivid(c) == "y":
           a = dwtoken()
-          a.setIndivid(c);
+          a.setIndivid(c,linenum);
           self._toks += [a]
           continue
         if isIdStart(c) == "y":
-          combotok.setInitialIdChar(c)
+          combotok.setInitialIdChar(c,linenum)
           dwclass = "id"
           continue
         # is "other"
-        combotok.setInitialOther(c)
+        combotok.setInitialOther(c,linenum)
         dwclass = "other"
         continue
       elif dwclass == "id": 
         if isIdNext(c) == "y":
           combotok.setNextIdChar(c)
           continue
+        if isShift(c) == "y":
+          combotok.finishUpId()
+          self._toks += [combotok]
+          combotok = dwtoken()
+          combotok.setInitialShift(c,linenum);
+          dwclass = "shift"
+          continue
         if isIndivid(c) == "y":
           combotok.finishUpId()
           self._toks += [combotok]
           combotok = dwtoken()
           a = dwtoken()
-          a.setIndivid(c);
+          a.setIndivid(c,linenum);
           dwclass = "ind"
           self._toks += [a]
           continue
+        if isIdStart(c) == "y":
+          # It is a valid initial character of an id.
+          # So we have id following id, like \a\a  
+          combotok.finishUpId()
+          self._toks += [combotok]
+          combotok = dwtoken()
+          combotok.setInitialIdChar(c,linenum)
+          dwclass = "id"
+          continue
         # Other class input, other starts here.
         combotok.finishUpId()
         self._toks += [combotok]
         combotok = dwtoken()
-        combotok.setInitialOther(c);
+        combotok.setInitialOther(c,linenum);
+        dwclass = "other"
+        continue
+      elif dwclass == "shift":
+        if isShift(c) == "y":
+          combotok.setNextShift(c);
+          continue
+        if isIndivid(c) == "y":
+          self._toks += [combotok]
+          combotok = dwtoken()
+          a = dwtoken()
+          a.setIndivid(c,linenum);
+          dwclass = "ind"
+          self._toks += [a]
+          continue
+        if isIdStart(c) == "y":
+          self._toks += [combotok]
+          combotok = dwtoken()
+          combotok.setInitialIdChar(c,linenum);
+          dwclass = "id"
+          continue
+        # Shift class input, other starts here.
+        self._toks += [combotok]
+        combotok = dwtoken()
+        combotok.setInitialOther(c,linenum);
         dwclass = "other"
         continue
       elif dwclass == "other":
+        if isShift(c) == "y":
+          self._toks += [combotok]
+          combotok = dwtoken()
+          combotok.setInitialShift(c,linenum);
+          dwclass = "shift"
+          continue
         if isIndivid(c) == "y":
           self._toks += [combotok]
           combotok = dwtoken()
           a = dwtoken()
-          a.setIndivid(c);
+          a.setIndivid(c,linenum);
           dwclass = "ind"
           self._toks += [a]
           continue
         if isIdStart(c) == "y":
           self._toks += [combotok]
           combotok = dwtoken()
-          combotok.setInitialIdChar(c);
+          combotok.setInitialIdChar(c,linenum);
           dwclass = "id"
           continue
         combotok.setNextOther(c);
@@ -253,6 +354,9 @@ class  dwline:
       combotok.finishUpId()
       self._toks += [combotok]
       dwclass = "none"
+    if dwclass == "shift":
+      self._toks += [combotok]
+      dwclass = "none"
     if dwclass == "other":
       self._toks += [combotok]
       dwclass = "none"
@@ -284,6 +388,7 @@ class dwfile:
     except IOError, message:
       print >> sys.stderr , "File could not be opened: ", name
       sys.exit(1)
+    linenum=0
     while 1:
       try:
         rec = file.readline()
@@ -292,9 +397,9 @@ class dwfile:
       if len(rec) < 1:
         # eof
         break
-
+      linenum = linenum +1
       aline = dwline()
-      aline.tokenize(rec)
+      aline.tokenize(rec,name,linenum)
       self._lines += [aline]
 
   def dwprint(self):
@@ -317,13 +422,26 @@ class dwfile:
     for l in self._lines:
       l.dwwrite(outfile,lnum)
       lnum = lnum + 1
+  # dwtransformtoks looks at the file as one token sequence,
+  # not as a sequence of lines.
+  # This view is required by recent changes to the .tex source.
+  def dwtransformfiletoks(self,callfunc,myfile):
+     FIXME 
+
+  def dwtransformtoks(self,callfunc,myfile):
+    globaltoklist = [] 
+    for l in self._lines:
+      for t in l._toks:
+        globaltoklist += [t]
+    toknum = 0
+    tokmax = len(globaltoklist)
+    self.dwtransformfiletoks(callfunc,myfile)
+
   def dwtransformline(self,callfunc,myfile):
-    lnum=1
+    lnum = 1
     for l in self._lines:
       l.dwtransformline(callfunc,myfile,lnum)
       lnum = lnum + 1
-    
-
 
 class dwfiles:
   def __init__(self):
@@ -344,6 +462,9 @@ class dwfiles:
   def dwtransformline(self,callfunc):
     for f in self._files:
       f.dwtransformline(callfunc,f)
+  def dwtransformtoks(self,callfunc):
+    for f in self._files:
+      f.dwtransformtoks(callfunc,f)
 
 
 def setkeepordeletecomments(val):