Adding to the tools, in the tools directory.
authorDavid Anderson <davea42@earthlink.net>
Thu, 6 Dec 2012 23:35:25 +0000 (15:35 -0800)
committerDavid Anderson <davea42@earthlink.net>
Thu, 6 Dec 2012 23:35:25 +0000 (15:35 -0800)
The new tool fixes up the UTF-8 characters introduced
accidentally by copy/paste into the
current .tex files, changing them into proper LaTeX
commands so everything will then look right.

No change to any .tex file in this commit.

dwarf5/tools/charfix.py [new file with mode: 0644]
dwarf5/tools/charfix.sh [new file with mode: 0644]
dwarf5/tools/fileio.py

diff --git a/dwarf5/tools/charfix.py b/dwarf5/tools/charfix.py
new file mode 100644 (file)
index 0000000..b7c785e
--- /dev/null
@@ -0,0 +1,242 @@
+# Copyright 2012 DWARF Debugging Information Format Committee
+#
+# 
+# 
+# Looks for certain multi-byte UTF-8 characters and replaces
+# them with the appropriate ASCII LaTeX commands.
+# See "The Comprehensive LaTeX Symbol List"
+#http://www.tex.ac.uk/tex-archive/info/symbols/comprehensive/symbols-a4.pdf
+
+#\newcommand{\doublequote}[1]{\textquotedblleft#1\textquotedblright}
+#\newcommand{\singlequote}[1]{\textquoteleft#1\textquoteright}
+
+#
+#  utf         latex                 description
+# e2 80 9c      \textquotedblleft    (two curly left quote chars)  
+# e2 80 9d      \textquotedblright   (two curly right quote chars)
+#              or \doublequote{stringtoquote} (our command)
+# e2 80 9b      \textquoteleft       (single left quote char, U+201B, as matched by isutfleftsingle)
+# e2 80 99      \textquoteright      (single curly right quote char, U+2019)
+#      by itself for contractions like don't
+#      or \singlequote{stringtoquote} for a left right pair (our command)
+#
+# e2 80 93      \textendash          (en dash, used as a minus sign)
+# e2 80 94      \textemdash          (em dash, the long dash)
+#        \textthreequartersemdash (long dash, from textcomp package)
+#               \textendash          (long dash         )
+#               \leftrightline (from MnSymbol package)
+# e2 80 a6     \dots         (...)
+#                 \textellipsis  is also usable.
+# e2 84 a2      \texttrademark      trademark symbol
+# c2 a9         \copyright          (copyright symbol; two-byte sequence, not handled below)
+
+import sys
+import fileio
+
+
+def ischar(tok,c):
+   if tok._class != "ind":
+      return "n"
+   if len(tok._tex) != 1:
+       return "n"
+   if tok._tex[0] != c:
+       return "n"
+   return "y"
+
+
+def append_to_out(out,addthese):
+  for a in addthese:
+    out += [a]
+
+def isutf80prefix(t):
+  if t[0] != chr(226):
+    return "n"
+  if t[1] != chr(128):
+    return "n"
+  return "y"
+def isutf84prefix(t):
+  if t[0] != chr(226):
+    return "n"
+  if t[1] != chr(132):
+    return "n"
+  return "y"
+
+def isutfleftdouble(t):
+  if isutf80prefix(t) != "y":
+    return "n"
+  if t[2] != chr(156):
+    return "n"
+  return "y"
+def isutfrightdouble(t):
+  if isutf80prefix(t) != "y":
+    return "n"
+  if t[2] != chr(157):
+    return "n"
+  return "y"
+def isutfleftsingle(t):
+  if isutf80prefix(t) != "y":
+    return "n"
+  if t[2] != chr(155):
+    return "n"
+  return "y"
+def isutfrightsingle(t):
+  if isutf80prefix(t) != "y":
+    return "n"
+  if t[2] != chr(153):
+    return "n"
+  return "y"
+def isutfdash(t):
+  if isutf80prefix(t) != "y":
+    return "n"
+  if t[2] != chr(148):
+    return "n"
+  return "y"
+def isutfminus(t):
+  if isutf80prefix(t) != "y":
+    return "n"
+  if t[2] != chr(147):
+    return "n"
+  return "y"
+def isutftrademark(t):
+  if isutf84prefix(t) != "y":
+    return "n"
+  if t[2] != chr(162):
+    return "n"
+  return "y"
+  
+def isutfdots(t):
+  if isutf80prefix(t) != "y":
+    return "n"
+  if t[2] != chr(166):
+    return "n"
+  return "y"
+
+# Following just for a unique single-right-quote case.
+def maybeinsertnonbreakspace(outtoks,linetoks,nexttoknumin,lasttoknum):
+  """ after the word 'variables' and a quote, insert non break space
+  so things look ok. Bit of a hack to take care of one case. """
+  # Check 2 since it is counting the one after current and
+  # we look before current.
+  if nexttoknumin < 2:
+    return
+  t = linetoks[nexttoknumin-2]
+  rawtok = ''.join(t._tex)
+  if rawtok != "variables":
+    return
+  t1=fileio.dwtoken()
+  t1.setInitialOther("\\")
+  t1.setNextOther(" ")
+  append_to_out(outtoks,[t1])
+  
+def maybeinsertspace(outtoks,linetoks,nexttoknumin,lasttoknum):
+  if nexttoknumin > lasttoknum:
+    return
+  t = linetoks[nexttoknumin]
+  rawtok = ''.join(t._tex)
+  if rawtok[0] == " ":
+    return
+  t1=fileio.dwtoken()
+  t1.setIndivid(" ")
+  append_to_out(outtoks,[t1])
+  return
+def transfunc(linetoks,myfile,linenum):
+  if len(linetoks) < 1:
+    return linetoks
+  tnumin = 0
+  changes = 0
+  lasttoknum = len(linetoks) -1
+  outtoks = []
+  nexttoknumin = 0 
+  for t in linetoks:
+    nexttoknumin = nexttoknumin + 1
+    rawtok = ''.join(t._tex)
+    #stdname= ''.join(t._std)
+    #linkname = "chap:" + ''.join(t._label)
+    if len(rawtok) == 3:
+        if isutfleftdouble(rawtok) == "y":
+            t1=fileio.dwtoken()
+            t1.insertid("\\doublequote")
+            t2=fileio.dwtoken()
+            t2.setIndivid("{")
+            append_to_out(outtoks,[t1])
+            append_to_out(outtoks,[t2])
+            changes = changes +  1
+        elif isutfrightdouble(rawtok) == "y":
+            t4=fileio.dwtoken()
+            t4.setIndivid("}")
+            append_to_out(outtoks,[t4])
+            changes = changes +  1
+        elif isutfleftsingle(rawtok) == "y":
+            # Here, the odd trailing space is so the command does not
+            # run into the following word in the output.
+            # Sometimes a right single quote is in a contraction,
+            # not part of a pair, so we don't try to pair them
+            # here with \singlequote{}
+            t1=fileio.dwtoken()
+            t1.insertid("\\textquoteleft")
+            append_to_out(outtoks,[t1])
+            maybeinsertspace(outtoks,linetoks,nexttoknumin,lasttoknum)
+            changes = changes +  1
+        elif isutfrightsingle(rawtok) == "y":
+            t1=fileio.dwtoken()
+            t1.insertid("\\textquoteright")
+            append_to_out(outtoks,[t1])
+            maybeinsertnonbreakspace(outtoks,linetoks,nexttoknumin,lasttoknum)
+            maybeinsertspace(outtoks,linetoks,nexttoknumin,lasttoknum)
+            changes = changes +  1
+        elif isutfdash(rawtok) == "y":
+            t1=fileio.dwtoken()
+            t1.insertid("\\textemdash" )
+            append_to_out(outtoks,[t1])
+            maybeinsertspace(outtoks,linetoks,nexttoknumin,lasttoknum)
+            changes = changes +  1
+        elif isutfminus(rawtok) == "y":
+            t1=fileio.dwtoken()
+            t1.insertid("\\textendash")
+            append_to_out(outtoks,[t1])
+            maybeinsertspace(outtoks,linetoks,nexttoknumin,lasttoknum)
+            changes = changes +  1
+        elif isutftrademark(rawtok) == "y":
+            # Force a non-break space after the TM symbol
+            # so the output has a space for real.
+            t1=fileio.dwtoken()
+            t1.insertid("\\texttrademark\\ ")
+            append_to_out(outtoks,[t1])
+            maybeinsertspace(outtoks,linetoks,nexttoknumin,lasttoknum)
+            changes = changes +  1
+        elif isutfdots(rawtok) == "y":
+            t1=fileio.dwtoken()
+            t1.insertid("\\dots")
+            append_to_out(outtoks,[t1])
+            maybeinsertspace(outtoks,linetoks,nexttoknumin,lasttoknum)
+            changes = changes +  1
+        else:
+            outtoks += [t]
+    else:
+      outtoks += [t]
+    tnumin = tnumin+ 1
+    # End of for loop.
+  return outtoks
+
+def process_files(filelist):
+  dwf = fileio.readFilelist(filelist)
+  dwf.dwtransformline(transfunc)
+  dwf.dwwrite()
+
+def read_all_args():
+  filelist = []
+  cur = 1
+  while  len(sys.argv) > cur:
+    v = sys.argv[cur]
+    filelist += [v]
+    cur = int(cur) + 1
+  if len(filelist) < 1:
+    print >> sys.stderr , "No files specified."
+    sys.exit(1)
+  process_files(filelist)
+
+#  Usage: python charfix.py file ...
+
+if __name__ == '__main__':
+  read_all_args()
+
diff --git a/dwarf5/tools/charfix.sh b/dwarf5/tools/charfix.sh
new file mode 100644 (file)
index 0000000..30298c5
--- /dev/null
@@ -0,0 +1,6 @@
+
+for i in ../latexdoc/*.tex
+do
+    python charfix.py $i
+    mv $i.out $i
+done
index 7609804..eb6a19d 100644 (file)
@@ -47,6 +47,10 @@ def isIdNext(c):
   if c == "_":
     return "y"
   return "n"
+def isShift(c):
+  if ord(c) >= 128:
+    return "y"
+  return "n"
 def isIndivid(c):
   if c == "[":
     return "y"
@@ -63,12 +67,20 @@ def isIndivid(c):
   return "n"
 
 class dwtoken:
+  """ Token types: 
+  id: identifier
+  ind: a character taken as an individual character.
+  none: No characters seen yet.
+  shift: A character with the high bit of 8 bits set, not something we expect.
+  -      In DW4 these high-bit-chars are special 3-character left and right quotes.
+  -      charfix.py  can replace these with Latex ascii quotes.
+  other: Some other character, but ascii, seemingly.  """
   def __init__(self):
     self._tex = []
     self._underbar = []
     self._std = []
     self._label = []
-    # Class is "id", "ind","other","none"
+    # Class is "id", "ind","other","shift","none"
     self._class = "none"
   def insertid(self,string):
     self._class =  "id"
@@ -88,6 +100,18 @@ class dwtoken:
   def setNextIdChar(self,c):
     self._tex += [c]
 
+  def setInitialShift(self,c):
+    self._tex = [c]
+    self._underbar = [c]
+    self._std = [c]
+    self._label = [c]
+    self._class =  "shift"
+  def setNextShift(self,c):
+    self._tex += [c]
+    self._underbar += [c]
+    self._std += [c]
+    self._label += [c]
+    self._class =  "shift"
   def setInitialOther(self,c):
     self._tex = [c]
     self._underbar = [c]
@@ -159,20 +183,29 @@ class dwtoken:
 class  dwline:
   """using an input line, create a list of tokens for the line.
      Legal class transitions in tokenize() are:
+     none->shift
      none->other
      none->id
      none->ind
+
      other->ind
      other->id
+     other->shift
+
+     shift->id
+     shift->ind
+     shift->other
+
      id->ind
      id->other
+     id->shift
   """
   def __init__(self):
     # list of dwtoken.
     self._toks = []
 
   
-  def tokenize(self,rec):
+  def tokenize(self,rec,filename,linenum):
     """using an input line, create a list of tokens for the line.
        Legal class transitions in tokenize() are:
        none->other
@@ -189,6 +222,8 @@ class  dwline:
     global keepcomments
     for c in rec:
       charnum = charnum +1
+      if ord(c) >= 128:
+        print " Warning: encountered character ord:",ord(c), "at offset",charnum,"line",linenum,filename
       if keepcomments == "d" and c == "%" and ( charnum == 0 or rec[charnum - 1] != "\\" ):  
         # Not keeping comments. We drop % and following to end of line 
         # unless preceeded by \ 
@@ -199,6 +234,10 @@ class  dwline:
           # would not be harmful.
           continue
       elif dwclass == "none" or dwclass == "ind":
+        if isShift(c) == "y":
+          combotok.setInitialShift(c)
+          dwclass = "shift"
+          continue
         if isIndivid(c) == "y":
           a = dwtoken()
           a.setIndivid(c);
@@ -216,6 +255,13 @@ class  dwline:
         if isIdNext(c) == "y":
           combotok.setNextIdChar(c)
           continue
+        if isShift(c) == "y":
+          combotok.finishUpId()
+          self._toks += [combotok]
+          combotok = dwtoken()
+          combotok.setInitialShift(c);
+          dwclass = "shift"
+          continue
         if isIndivid(c) == "y":
           combotok.finishUpId()
           self._toks += [combotok]
@@ -232,7 +278,37 @@ class  dwline:
         combotok.setInitialOther(c);
         dwclass = "other"
         continue
+      elif dwclass == "shift":
+        if isShift(c) == "y":
+          combotok.setNextShift(c);
+          continue
+        if isIndivid(c) == "y":
+          self._toks += [combotok]
+          combotok = dwtoken()
+          a = dwtoken()
+          a.setIndivid(c);
+          dwclass = "ind"
+          self._toks += [a]
+          continue
+        if isIdStart(c) == "y":
+          self._toks += [combotok]
+          combotok = dwtoken()
+          combotok.setInitialIdChar(c);
+          dwclass = "id"
+          continue
+        # Shift class input, other starts here.
+        self._toks += [combotok]
+        combotok = dwtoken()
+        combotok.setInitialOther(c);
+        dwclass = "other"
+        continue
       elif dwclass == "other":
+        if isShift(c) == "y":
+          self._toks += [combotok]
+          combotok = dwtoken()
+          combotok.setInitialShift(c);
+          dwclass = "shift"
+          continue
         if isIndivid(c) == "y":
           self._toks += [combotok]
           combotok = dwtoken()
@@ -256,6 +332,9 @@ class  dwline:
       combotok.finishUpId()
       self._toks += [combotok]
       dwclass = "none"
+    if dwclass == "shift":
+      self._toks += [combotok]
+      dwclass = "none"
     if dwclass == "other":
       self._toks += [combotok]
       dwclass = "none"
@@ -287,6 +366,7 @@ class dwfile:
     except IOError, message:
       print >> sys.stderr , "File could not be opened: ", name
       sys.exit(1)
+    linenum=0
     while 1:
       try:
         rec = file.readline()
@@ -295,9 +375,9 @@ class dwfile:
       if len(rec) < 1:
         # eof
         break
-
+      linenum = linenum +1
       aline = dwline()
-      aline.tokenize(rec)
+      aline.tokenize(rec,name,linenum)
       self._lines += [aline]
 
   def dwprint(self):