topazscripts 1.5

author some_updates <some_updates@gmail.com>

Wed, 20 Jan 2010 12:13:31 +0000 (12:13 +0000)

committer Apprentice Alf <apprenticealf@gmail.com>

Sat, 28 Feb 2015 12:13:49 +0000 (12:13 +0000)
author some_updates <some_updates@gmail.com>
Wed, 20 Jan 2010 12:13:31 +0000 (12:13 +0000)
committer Apprentice Alf <apprenticealf@gmail.com>
Sat, 28 Feb 2015 12:13:49 +0000 (12:13 +0000)
diff --git a/Topaz_Tools/lib/changes.txt b/Topaz_Tools/lib/changes.txt

new file mode 100644 (file)

index 0000000..cc2f00a
--- /dev/null
+++ b/Topaz_Tools/lib/changes.txt
@@ -0,0 +1,20 @@
+Changes in version 1.5
+       - completely reworked generation of styles to use actual page heights and widths
+       - added new script getpagedim.py to support the above
+       - style names with underscores in them are now properly paired with their base class
+       - fixed hanging indents that did not ever set a left margin
+       - added support for a number of not previously known region types
+       - added support for a previously unknown snippet - <empty></empty>
+       - corrected a bug that caused unknown regions to abort the program
+       - added code to make the handling of unknown regions better in general
+       - corrected a bug that caused the last link on a page to be missing (if it was the last thing on the page)
+
+Changes in version 1.3
+       - font generation by gensvg.py is now greatly improved with support for contour points added
+       - support for more region types
+       - support for inline images in paragraphs or text fields (i.e. initial graphics for the first letter of a word)
+       - greatly improved dtd information used for the xml to prevent parsing mistakes
+
+Version 1.0
+       - initial release
+
diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py

index 4bec36fa2be7f900115b88229d8e7484087d1931..07741a71e1285b546ffaf6a71168c83bcf47e211 100644 (file)
--- a/Topaz_Tools/lib/convert2xml.py
+++ b/Topaz_Tools/lib/convert2xml.py
@@ -93,7 +93,7 @@ def convert(i):
      for j in xrange(len(val)):
          c = ord(val[j:j+1])
          result += '%02x' % c
-        return result
+    return result
  
  
  
@@ -209,6 +209,8 @@ class PageParser(object):
          'wordStems'          : (0, 'number', 1, 1),
          'wordStems.stemID'   : (1, 'number', 0, 0),
  
+        'empty'          : (1, 'snippets', 1, 0),
+
          'page'           : (1, 'snippets', 1, 0),
          'page.pageid'    : (1, 'scalar_text', 0, 0),
          'page.pagelabel' : (1, 'scalar_text', 0, 0),
@@ -750,6 +752,7 @@ def main(argv):
  
      # read in the string table dictionary
      dict = Dictionary(dictFile)
+    # dict.dumpDict()
  
      # create a page parser
      pp = PageParser(pageFile, dict, debug, flat_xml)
diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py

index f93318f9b0287aa497161ca327224b8b79f32985..f2dd244e243e9714f9b113f309fcc131540e151a 100644 (file)
--- a/Topaz_Tools/lib/flatxml2html.py
+++ b/Topaz_Tools/lib/flatxml2html.py
@@ -90,20 +90,23 @@ class DocParser(object):
  
          # class names are an issue given topaz may start them with numerals (not allowed),
          # use a mix of cases (which cause some browsers problems), and actually
-        # attach numbers after "_reclustered*" to the end to deal with reflow issues
-        # but then not actually provide all of these _reclustereed classes in the stylesheet!
-
-        # so we clean this up by lowercasing, prepend 'cl_', and if not in the class
-        # list from the stylesheet, trying once more with "_reclustered*" removed
-        # if still not in stylesheet, let it pass as is
+        # attach numbers after "_reclustered*" to the end to deal with classes that inherit
+        # from a base class (but then not actually provide all of these _reclustered
+        # classes in the stylesheet!)
+
+        # so we clean this up by lowercasing, prepending 'cl-', and getting any baseclass
+        # that exists in the stylesheet first, and then adding this specific class
+        # after
+        classres = ''
          pclass = pclass.lower()
-        pclass = 'cl_' + pclass
-        if pclass not in self.classList:
-            p = pclass.find('_reclustered')
-            if p > 0 : 
-                baseclass = pclass[0:p]
-                if baseclass in self.classList:
-                    pclass = baseclass
+        pclass = 'cl-' + pclass
+        p = pclass.find('_')
+        if p > 0 :
+            baseclass = pclass[0:p]
+            if baseclass in self.classList:
+                classres += baseclass + ' '
+        classres += pclass
+        pclass = classres
  
          # build up a description of the paragraph in result and return it
          # first check for the  basic - all words paragraph
@@ -123,6 +126,12 @@ class DocParser(object):
          line = start + 1
          word_class = ''
  
+        # if end is -1 then we must search to end of document
+        if end == -1 :
+            docList = self.flatdoc
+            cnt = len(docList)
+            end = cnt
+
          while (line < end) :
  
              (name, argres) = self.lineinDoc(line)
@@ -139,7 +148,8 @@ class DocParser(object):
  
              elif name.endswith('word.class'):
                 (cname, space) = argres.split('-',1)
-               if cname == 'spaceafter':
+               if space == '' : space = '0'
+               if (cname == 'spaceafter') and (int(space) > 0) :
                     word_class = 'sa'
  
              elif name.endswith('word.img.src'):
@@ -166,7 +176,7 @@ class DocParser(object):
          sep =''
  
          br_lb = False
-        if (regtype == 'fixed') or (regtype == 'chapterheading') :
+        if (regtype == 'fixed') or (regtype == 'chapterheading'):
              br_lb = True
  
          handle_links = False
@@ -193,7 +203,8 @@ class DocParser(object):
                      link = self.link_id[num]
                      if (link > 0): 
                          title = self.link_title[link-1]
-                        if title == "": title='_link_'
+                        if (title == "") or (parares.rfind(title) < 0): 
+                            title='_link_'
                          ptarget = self.link_page[link-1] - 1
                          linkhtml = '<a href="#page%04d">' % ptarget
                          linkhtml += title + '</a>'
@@ -326,7 +337,7 @@ class DocParser(object):
                  htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
                  htmlpage += '</' + tag + '>'
  
-            elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') :
+            elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem') :
                  ptype = 'full'
                  # check to see if this is a continution from the previous page
                  if (len(self.parastems_stemid) > 0):
@@ -348,7 +359,6 @@ class DocParser(object):
                  else :
                      htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
  
-
              elif (regtype == 'tocentry') :
                  ptype = 'full'
                  # check to see if this is a continution from the previous page
@@ -363,7 +373,7 @@ class DocParser(object):
                  (pclass, pdesc) = self.getParaDescription(start,end)
                  htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
  
-            elif regtype == 'synth_fcvr.center' :
+            elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
                  if not anchorSet:
                      htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                      anchorSet = True
@@ -373,30 +383,38 @@ class DocParser(object):
  
              else :
                  print 'Warning: Unknown region type', regtype
-                print 'Treating this like a "fixed" region'
-                regtype = 'fixed'
-                ptype = 'full'
-                # check to see if this is a continution from the previous page
-                if (len(self.parastems_stemid) > 0):
-                    ptype = 'end'
-                    self.parastems_stemid=[]
-                else:
+                (pos, temp) = self.findinDoc('paragraph',start,end)
+                if temp:
+                    print 'Treating this like a "text" region'
+                    regtype = 'fixed'
+                    ptype = 'full'
+                    # check to see if this is a continuation from the previous page
+                    if (len(self.parastems_stemid) > 0):
+                        ptype = 'end'
+                        self.parastems_stemid=[]
+                    else:
+                        if not anchorSet:
+                            htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
+                            anchorSet = True
+                    (pclass, pdesc) = self.getParaDescription(start,end)
+                    if ptype == 'full' :
+                        tag = 'p'
+                        if pclass[3:6] == 'h1-' : tag = 'h4'
+                        if pclass[3:6] == 'h2-' : tag = 'h5'
+                        if pclass[3:6] == 'h3-' : tag = 'h6'
+                        htmlpage += '<' + tag + ' class="' + pclass + '">'
+                        htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
+                        htmlpage += '</' + tag + '>'
+                    else :
+                        htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
+                else :
+                    print 'Treating this like a "image" region'
                      if not anchorSet:
                          htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
                          anchorSet = True
-                (pclass, desc) = self.getParaDescription(start,end)
-                if ptype == 'full' :
-                    tag = 'p'
-                    if pclass[3:6] == 'h1-' : tag = 'h4'
-                    if pclass[3:6] == 'h2-' : tag = 'h5'
-                    if pclass[3:6] == 'h3-' : tag = 'h6'
-                    htmlpage += '<' + tag + ' class="' + pclass + '">'
-                    htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
-                    htmlpage += '</' + tag + '>'
-                else :
-                    htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
-
-
+                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
+                    if simgsrc:
+                        htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
  
          if len(self.paracont_stemid) > 0 :
              if htmlpage[-4:] == '</p>':
diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py

index 05261c93a3bf6baac1c1fa8858b2ea37788493cb..df395390bf09c3ce8f602e086504ea6980716745 100644 (file)
--- a/Topaz_Tools/lib/genhtml.py
+++ b/Topaz_Tools/lib/genhtml.py
@@ -8,7 +8,7 @@ import convert2xml
  import flatxml2html
  import decode_meta
  import stylexml2css
-
+import getpagedim
  
  def usage():
      print 'Usage: '
@@ -86,6 +86,7 @@ def main(argv):
  
      htmlstr += '<head>\n'
  
+    # process metadata and retrieve fontSize info
      print '     ', 'metadata0000.dat'
      fname = os.path.join(bookDir,'metadata0000.dat')
      xname = os.path.join(bookDir, 'metadata.txt')
@@ -100,12 +101,27 @@ def main(argv):
      if 'fontSize' in meta_array:
          fontsize = meta_array['fontSize']
  
+    # also get the size of a normal text page
+    spage = '1'
+    if 'firstTextPage' in meta_array:
+        spage = meta_array['firstTextPage']
+    pnum = int(spage)
+
+    # get page height and width from first text page for use in stylesheet scaling
+    pname = 'page%04d.dat' % pnum
+    fname = os.path.join(pageDir,pname)
+    flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
+    (ph, pw) = getpagedim.getPageDim(flat_xml)
+    if (ph == '-1') : ph = 11000
+    if (pw == '-1') : pw = 8500
+
+    # now build up the style sheet
      print '     ', 'other0000.dat'
      fname = os.path.join(bookDir,'other0000.dat')
      xname = os.path.join(bookDir, 'style.css')
      xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
      htmlstr += '<style>\n'
-    cssstr , classlst = stylexml2css.convert2CSS(xmlstr, fontsize)
+    cssstr , classlst = stylexml2css.convert2CSS(xmlstr, fontsize, ph, pw)
      file(xname, 'wb').write(cssstr)
      htmlstr += cssstr
      htmlstr += '</style>\n'
diff --git a/Topaz_Tools/lib/getpagedim.py b/Topaz_Tools/lib/getpagedim.py

new file mode 100644 (file)

index 0000000..dd1071c
--- /dev/null
+++ b/Topaz_Tools/lib/getpagedim.py
@@ -0,0 +1,53 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import with_statement
+import csv
+import sys
+import os
+import getopt
+from struct import pack
+from struct import unpack
+
+
+class DocParser(object):
+    def __init__(self, flatxml):
+        self.flatdoc = flatxml.split('\n')
+
+
+    # find tag within lines pos (inclusive) to end (exclusive); end == -1 means search to end of document
+    def findinDoc(self, tagpath, pos, end) :
+        result = None
+        docList = self.flatdoc
+        cnt = len(docList)
+        if end == -1 :
+            end = cnt
+        else:
+            end = min(cnt,end)
+        foundat = -1
+        for j in xrange(pos, end):
+            item = docList[j]
+            if item.find('=') >= 0:
+                (name, argres) = item.split('=')
+            else : 
+                name = item
+                argres = ''
+            if name.endswith(tagpath) : 
+                result = argres
+                foundat = j
+                break
+        return foundat, result
+
+    def process(self):
+        (pos, sph) = self.findinDoc('page.h',0,-1)
+        (pos, spw) = self.findinDoc('page.w',0,-1)
+        if (sph == None): sph = '-1'
+        if (spw == None): spw = '-1'
+        return sph, spw
+
+
+def getPageDim(flatxml):
+    # create a document parser
+    dp = DocParser(flatxml)
+    (ph, pw) = dp.process()
+    return ph, pw
diff --git a/Topaz_Tools/lib/readme.txt b/Topaz_Tools/lib/readme.txt

index afe4a5a7fcda8a3d53464fe1720b1b7975b93e07..c9fcb611265d74a0246854adce3b52a9a986b252 100644 (file)
--- a/Topaz_Tools/lib/readme.txt
+++ b/Topaz_Tools/lib/readme.txt
@@ -3,7 +3,8 @@ Contributors:
       clarknova - for all of the svg and glyph generation and many other bug fixes and improvements
       skindle - for figuing out the general case for the mode loops
       some updates -  for conversion to xml, basic html
-     DiapDealer - for extensive testing and feeback
+     DiapDealer - for extensive testing and feedback
+     stewball - for extensive testing and feedback
  
  and others for posting, feedback and testing
    
@@ -23,12 +24,13 @@ decode_meta.py - converts metadata0000.dat to human readable text (for the most
  convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions
  flatxml2html.py - converts a "flattened" xml description to html using the ocrtext
  stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can)
+getpagedim.py - reads the flattened xml of a page*.dat file (genhtml.py uses the first text page) to get the page height and width parameters
  genxml.py - main program to convert everything to xml
  genhtml.py - main program to generate "book.html"
  gensvg.py - (author: clarknova) main program to create an svg grpahic of each page
  
  Please note, gensvg.py, genhtml.py, and genxml.py import and use
-decode_meta.py, convert2xml.py, flatxml2html.py, and stylexml2css.py 
+decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py and stylexml2css.py 
  so please keep all of these python scripts together in the same place.
  
  
diff --git a/Topaz_Tools/lib/stylexml2css.py b/Topaz_Tools/lib/stylexml2css.py

index ede6767a3899ad9f6388d2083d063f271c50801c..0d2739b45d184a30428d30b1b7876ae1f9950d99 100644 (file)
--- a/Topaz_Tools/lib/stylexml2css.py
+++ b/Topaz_Tools/lib/stylexml2css.py
@@ -11,9 +11,11 @@ from struct import unpack
  
  
  class DocParser(object):
-    def __init__(self, flatxml, fontsize):
+    def __init__(self, flatxml, fontsize, ph, pw):
          self.flatdoc = flatxml.split('\n')
          self.fontsize = int(fontsize)
+        self.ph = int(ph) * 1.0
+        self.pw = int(pw) * 1.0
  
      stags = {
          'paragraph' : 'p',
@@ -106,14 +108,14 @@ class DocParser(object):
                  # get the style class
                  (pos, sclass) = self.findinDoc('style.class',start,end)
                  if sclass != None:
-                    sclass = '.cl_' + sclass.lower()
+                    sclass = '.cl-' + sclass.lower()
                  else : 
                      sclass = ''
  
                  # check for any "after class" specifiers
                  (pos, aftclass) = self.findinDoc('style._after_class',start,end)
                  if aftclass != None:
-                    aftclass = '.cl_' + aftclass.lower()
+                    aftclass = '.cl-' + aftclass.lower()
                  else : 
                      aftclass = ''
  
@@ -121,8 +123,8 @@ class DocParser(object):
  
                  while True :
  
-                    (pos, attr) = self.findinDoc('style.rule.attr', start, end)
-                    (pos, val) = self.findinDoc('style.rule.value', start, end)
+                    (pos1, attr) = self.findinDoc('style.rule.attr', start, end)
+                    (pos2, val) = self.findinDoc('style.rule.value', start, end)
  
                      if attr == None : break
                      
@@ -135,28 +137,34 @@ class DocParser(object):
                          # handle value based attributes
                          if attr in self.attr_val_map :
                              name = self.attr_val_map[attr]
-                            scale = self.fontsize
-                            if attr == 'line-space': scale = scale * 1.41
+                            if attr in ('margin-bottom', 'margin-top', 'space-after') :
+                                scale = self.ph
+                            elif attr in ('margin-right', 'indent', 'margin-left', 'hang') :
+                                scale = self.pw
+                            elif attr == 'line-space':
+                                scale = self.fontsize * 2.0
+
                              if not ((attr == 'hang') and (int(val) == 0)) :
-                                ems = int(val)/scale
-                                cssargs[attr] = (self.attr_val_map[attr], ems)
+                                pv = float(val)/scale
+                                cssargs[attr] = (self.attr_val_map[attr], pv)
                                  keep = True
  
-                    start = pos + 1
+                    start = max(pos1, pos2) + 1
  
                  # disable all of the after class tags until I figure out how to handle them
                  if aftclass != "" : keep = False
  
                  if keep :
-                    # make sure line-space does not go below 1em
+                    # make sure line-space does not go below 100% or above 300% since 
+                    # it can be wacky in some styles
                      if 'line-space' in cssargs:
                          seg = cssargs['line-space'][0]
                          val = cssargs['line-space'][1]
                          if val < 1.0: val = 1.0
+                        if val > 3.0: val = 3.0
                          del cssargs['line-space']
                          cssargs['line-space'] = (self.attr_val_map['line-space'], val)
  
-
                      
                      # handle modifications for css style hanging indents
                      if 'hang' in cssargs:
@@ -166,11 +174,13 @@ class DocParser(object):
                          cssargs['hang'] = (self.attr_val_map['hang'], -hval)
                          mval = 0
                          mseg = 'margin-left: '
+                        mval = hval
                          if 'margin-left' in cssargs:
                              mseg = cssargs['margin-left'][0]
                              mval = cssargs['margin-left'][1]
+                            if mval < 0: mval = 0
                              mval = hval + mval
-                            cssargs['margin-left'] = (mseg, mval)
+                        cssargs['margin-left'] = (mseg, mval)
                          if 'indent' in cssargs:
                              del cssargs['indent']
  
@@ -181,7 +191,7 @@ class DocParser(object):
                          if mval == '':
                              cssline += mseg + ' '
                          else :
-                            aseg = mseg + '%.1fem;' % mval
+                            aseg = mseg + '%.1f%%;' % (mval * 100.0)
                              cssline += aseg + ' '
  
                      cssline += '}'
@@ -213,10 +223,14 @@ class DocParser(object):
  
  
  
-def convert2CSS(flatxml, fontsize):
+def convert2CSS(flatxml, fontsize, ph, pw):
+
+    print '          ', 'Using font size:',fontsize
+    print '          ', 'Using page height:', ph
+    print '          ', 'Using page width:', pw
  
      # create a document parser
-    dp = DocParser(flatxml, fontsize)
+    dp = DocParser(flatxml, fontsize, ph, pw)
  
      csspage = dp.process()
author	some_updates <some_updates@gmail.com>
	Wed, 20 Jan 2010 12:13:31 +0000 (12:13 +0000)
committer	Apprentice Alf <apprenticealf@gmail.com>
	Sat, 28 Feb 2015 12:13:49 +0000 (12:13 +0000)
Topaz_Tools/lib/changes.txt	[new file with mode: 0644]	patch \| blob
Topaz_Tools/lib/convert2xml.py		patch \| blob \| blame \| history
Topaz_Tools/lib/flatxml2html.py		patch \| blob \| blame \| history
Topaz_Tools/lib/genhtml.py		patch \| blob \| blame \| history
Topaz_Tools/lib/getpagedim.py	[new file with mode: 0644]	patch \| blob
Topaz_Tools/lib/readme.txt		patch \| blob \| blame \| history
Topaz_Tools/lib/stylexml2css.py		patch \| blob \| blame \| history