tools v1.5

author Apprentice Alf <apprenticealf@gmail.com>

Tue, 2 Mar 2010 12:46:56 +0000 (12:46 +0000)

committer Apprentice Alf <apprenticealf@gmail.com>

Mon, 2 Mar 2015 07:43:31 +0000 (07:43 +0000)
author Apprentice Alf <apprenticealf@gmail.com>
Tue, 2 Mar 2010 12:46:56 +0000 (12:46 +0000)
committer Apprentice Alf <apprenticealf@gmail.com>
Mon, 2 Mar 2015 07:43:31 +0000 (07:43 +0000)
diff --git a/Topaz_Tools/lib/cmbtc_dump.py b/Topaz_Tools/lib/cmbtc_dump.py

index 83301ddd8c57c3cd777d0e02b860e817361d58b6..d7cef99db8a2d6b5eba6321eadb319a2b83d7749 100644 (file)
--- a/Topaz_Tools/lib/cmbtc_dump.py
+++ b/Topaz_Tools/lib/cmbtc_dump.py
@@ -1,5 +1,5 @@
  #! /usr/bin/python
-# For use in Topaz Scripts version 2.3
+# For use in Topaz Scripts version 2.6
  
  """
  
diff --git a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py

index 1508741cb1c47a63246278ea120a0e4a6d4cd25c..0d624043649cc4eab60c5407a3e427388ed2d18b 100644 (file)
--- a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py
+++ b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py
@@ -1,5 +1,5 @@
  #!/usr/bin/python
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6
  
  class Unbuffered:
      def __init__(self, stream):
diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py

index 18ae3f02ba369c9c97e3e2b4073047954ba46224..e3f0fe2fde96c4055458e3c66026d90e79040426 100644 (file)
--- a/Topaz_Tools/lib/convert2xml.py
+++ b/Topaz_Tools/lib/convert2xml.py
@@ -1,6 +1,6 @@
  #! /usr/bin/python
  # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.4
+# For use with Topaz Scripts Version 2.6
  
  class Unbuffered:
      def __init__(self, stream):
@@ -315,6 +315,12 @@ class PageParser(object):
          'version.findlists'                : (1, 'scalar_text', 0, 0),
          'version.page_num'                 : (1, 'scalar_text', 0, 0),
          'version.page_type'                : (1, 'scalar_text', 0, 0),
+        'version.bad_text'                 : (1, 'scalar_text', 0, 0),
+        'version.glyph_mismatch'           : (1, 'scalar_text', 0, 0),
+        'version.margins'                  : (1, 'scalar_text', 0, 0),
+        'version.staggered_lines'          : (1, 'scalar_text', 0, 0),
+        'version.paragraph_continuation'   : (1, 'scalar_text', 0, 0),
+        'version.toc'                      : (1, 'scalar_text', 0, 0),
  
          'stylesheet'   : (1, 'snippets', 1, 0),
          'style'              : (1, 'snippets', 1, 0),
@@ -662,16 +668,19 @@ class PageParser(object):
      def process(self):
  
          # peek at the first bytes to see what type of file it is
-        magic = self.fo.read(11)
-        if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'):
+        magic = self.fo.read(9)
+        if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
              first_token = 'info'
-        elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'):
-            skip = self.fo.read(1)
+        elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
+            skip = self.fo.read(2)
+            first_token = 'info'
+        elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
+            skip = self.fo.read(3)
              first_token = 'info'
          else :
              # other0.dat file
              first_token = None
-            self.fo.seek(-11,1)
+            self.fo.seek(-9,1)
  
  
          # main loop to read and build the document tree
diff --git a/Topaz_Tools/lib/decode_meta.py b/Topaz_Tools/lib/decode_meta.py

index 038f1338d91c42d62a23bbc581ccfd3c9133b39f..a63c5788ae876aabdec8e7049a344fe7827f7a72 100644 (file)
--- a/Topaz_Tools/lib/decode_meta.py
+++ b/Topaz_Tools/lib/decode_meta.py
@@ -1,6 +1,6 @@
  #! /usr/bin/python
  # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6
  
  import csv
  import sys
diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py

index 0fb106dc396c87e3283b719444c8de2549c74abb..1c4419f12940c3c345818ab12c30cefb5ef0be6d 100644 (file)
--- a/Topaz_Tools/lib/flatxml2html.py
+++ b/Topaz_Tools/lib/flatxml2html.py
@@ -1,6 +1,6 @@
  #! /usr/bin/python
  # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6
  
  import sys
  import csv
@@ -32,6 +32,8 @@ class DocParser(object):
          self.link_id = []
          self.link_title = []
          self.link_page = []
+        self.link_href = []
+        self.link_type = []
          self.dehyphen_rootid = []
          self.paracont_stemid = []
          self.parastems_stemid = []
@@ -197,6 +199,7 @@ class DocParser(object):
      # get the class
      def getClass(self, pclass):
          nclass = pclass
+
          # class names are an issue given topaz may start them with numerals (not allowed),
          # use a mix of cases (which cause some browsers problems), and actually
          # attach numbers after "_reclustered*" to the end to deal classeses that inherit
@@ -206,7 +209,10 @@ class DocParser(object):
          # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
          # that exists in the stylesheet first, and then adding this specific class
          # after
+        
+        # also some class names have spaces in them so need to convert to dashes
          if nclass != None :
+            nclass = nclass.replace(' ','-')
              classres = ''
              nclass = nclass.lower()
              nclass = 'cl-' + nclass
@@ -334,7 +340,7 @@ class DocParser(object):
              result.append(('svg', num))
              return pclass, result
  
-        # this type of paragrph may be made up of multiple spans, inline 
+        # this type of paragraph may be made up of multiple spans, inline 
          # word monograms (images), and words with semantic meaning, 
          # plus glyphs used to form starting letter of first word
          
@@ -391,6 +397,9 @@ class DocParser(object):
                  result.append(('img' + word_class, int(argres)))
                  word_class = ''
  
+            elif name.endswith('region.img.src'):
+                result.append(('img' + word_class, int(argres)))
+
              if (sp_first != -1) and (sp_last != -1):
                  for wordnum in xrange(sp_first, sp_last):
                      result.append(('ocr', wordnum))
@@ -437,6 +446,8 @@ class DocParser(object):
          if (type == 'end'):
              parares += ' '
  
+        lstart = len(parares)
+
          cnt = len(pdesc)
  
          for j in xrange( 0, cnt) :
@@ -449,18 +460,24 @@ class DocParser(object):
  
                  if handle_links:
                      link = self.link_id[num]
-                    if (link > 0): 
+                    if (link > 0):
+                        linktype = self.link_type[link-1]
                          title = self.link_title[link-1]
-                        if (title == "") or (parares.rfind(title) < 0): 
-                            title='_link_'
-                        ptarget = self.link_page[link-1] - 1
-                        linkhtml = '<a href="#page%04d">' % ptarget
+                        if (title == "") or (parares.rfind(title) < 0):
+                            title=parares[lstart:]
+                        if linktype == 'external' :
+                            linkhref = self.link_href[link-1]
+                            linkhtml = '<a href="%s">' % linkhref
+                        else :
+                            ptarget = self.link_page[link-1] - 1
+                            linkhtml = '<a href="#page%04d">' % ptarget
                          linkhtml += title + '</a>'
                          pos = parares.rfind(title)
                          if pos >= 0:
                              parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
                          else :
                              parares += linkhtml
+                        lstart = len(parares)
                          if word == '_link_' : word = ''
                      elif (link < 0) :
                          if word == '_link_' : word = ''
@@ -532,6 +549,14 @@ class DocParser(object):
          # collect link destination page numbers
          self.link_page = self.getData('info.links.page',0,-1)
  
+        # collect link types (container versus external)
+        (pos, argres) = self.findinDoc('info.links.type',0,-1)
+        if argres :  self.link_type = argres.split('|')
+
+        # collect link destinations
+        (pos, argres) = self.findinDoc('info.links.href',0,-1)
+        if argres :  self.link_href = argres.split('|')
+
          # collect link titles
          (pos, argres) = self.findinDoc('info.links.title',0,-1)
          if argres :
@@ -641,16 +666,18 @@ class DocParser(object):
                      htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
  
  
-                elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
+                elif (regtype == 'synth_fcvr.center'):
                      (pos, simgsrc) = self.findinDoc('img.src',start,end)
                      if simgsrc:
                          htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
  
                  else :
-                    print 'Warning: region type', regtype
+                    print '          Making region type', regtype,
                      (pos, temp) = self.findinDoc('paragraph',start,end)
-                    if pos != -1:
-                        print '   is a "text" region'
+                    (pos2, temp) = self.findinDoc('span',start,end)
+                    if pos != -1 or pos2 != -1:
+                        print ' a "text" region'
+                        orig_regtype = regtype
                          regtype = 'fixed'
                          ptype = 'full'
                          # check to see if this is a continution from the previous page
@@ -658,6 +685,11 @@ class DocParser(object):
                              ptype = 'end'
                              first_para_continued = False
                          (pclass, pdesc) = self.getParaDescription(start,end, regtype)
+                        if not pclass:
+                            if orig_regtype.endswith('.right')     : pclass = 'cl-right'
+                            elif orig_regtype.endswith('.center')  : pclass = 'cl-center'
+                            elif orig_regtype.endswith('.left')    : pclass = 'cl-left'
+                            elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
                          if pclass and (ptype == 'full') and (len(pclass) >= 6):
                              tag = 'p'
                              if pclass[3:6] == 'h1-' : tag = 'h4'
@@ -669,7 +701,7 @@ class DocParser(object):
                          else :
                              htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
                      else :
-                        print '    is a "graphic" region'
+                        print ' a "graphic" region'
                          (pos, simgsrc) = self.findinDoc('img.src',start,end)
                          if simgsrc:
                              htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py

index 82b2c728d8251805bc237a9fea2482b3be5e098d..b3cf9409695d96c8ba9b937e0e93707002063190 100644 (file)
--- a/Topaz_Tools/lib/genhtml.py
+++ b/Topaz_Tools/lib/genhtml.py
@@ -1,6 +1,6 @@
  #! /usr/bin/python
  # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6
  
  class Unbuffered:
      def __init__(self, stream):
diff --git a/Topaz_Tools/lib/gensvg.py b/Topaz_Tools/lib/gensvg.py

index 040fe9bbea18bb1d8a0345aafb20420ac7303a66..70f82b4cd7e2a925fb4984ccbed471d6f0fa6076 100644 (file)
--- a/Topaz_Tools/lib/gensvg.py
+++ b/Topaz_Tools/lib/gensvg.py
@@ -1,6 +1,6 @@
  #! /usr/bin/python
  # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6
  
  class Unbuffered:
      def __init__(self, stream):
diff --git a/Topaz_Tools/lib/genxml.py b/Topaz_Tools/lib/genxml.py

index a30c6308749e16a9acf5b411cd9fac859dfbb9af..be542f08815c995c89d962f7220844ca6e32552d 100644 (file)
--- a/Topaz_Tools/lib/genxml.py
+++ b/Topaz_Tools/lib/genxml.py
@@ -1,6 +1,6 @@
  #! /usr/bin/python
  # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6
  
  class Unbuffered:
      def __init__(self, stream):
diff --git a/Topaz_Tools/lib/getpagedim.py b/Topaz_Tools/lib/getpagedim.py

index af2a6f6ec91dae0e1b54825a505c7a5dfbf05d35..455a38e8ffc4789dc0b51f9bb8dc5a33c55743c1 100644 (file)
--- a/Topaz_Tools/lib/getpagedim.py
+++ b/Topaz_Tools/lib/getpagedim.py
@@ -1,6 +1,6 @@
  #! /usr/bin/python
  # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6
  
  import csv
  import sys
diff --git a/Topaz_Tools/lib/stylexml2css.py b/Topaz_Tools/lib/stylexml2css.py

index 0f84d696f3197c5887a0895e5a783000960f4a33..73f798fb4460729d69508c6ebcecc28852436331 100644 (file)
--- a/Topaz_Tools/lib/stylexml2css.py
+++ b/Topaz_Tools/lib/stylexml2css.py
@@ -1,6 +1,6 @@
  #! /usr/bin/python
  # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6
  
  import csv
  import sys
@@ -85,7 +85,10 @@ class DocParser(object):
      def process(self):
  
          classlst = ''
-        csspage = ''
+        csspage = '.cl-center { text-align: center; margin-left: auto; margin-right: auto; }\n'
+        csspage += '.cl-right { text-align: right; }\n'
+        csspage += '.cl-left { text-align: left; }\n'
+        csspage += '.cl-justify { text-align: justify; }\n'
  
          # generate a list of each <style> starting point in the stylesheet
          styleList= self.posinDoc('book.stylesheet.style')
@@ -108,6 +111,7 @@ class DocParser(object):
                  # get the style class
                  (pos, sclass) = self.findinDoc('style.class',start,end)
                  if sclass != None:
+                    sclass = sclass.replace(' ','-')
                      sclass = '.cl-' + sclass.lower()
                  else : 
                      sclass = ''
@@ -115,6 +119,7 @@ class DocParser(object):
                  # check for any "after class" specifiers
                  (pos, aftclass) = self.findinDoc('style._after_class',start,end)
                  if aftclass != None:
+                    aftclass = aftclass.replace(' ','-')
                      aftclass = '.cl-' + aftclass.lower()
                  else : 
                      aftclass = ''
@@ -216,7 +221,8 @@ class DocParser(object):
                          if ctype == 'h3_' :
                              csspage += 'h6' + cssline + '\n'
  
-                    csspage += self.stags[tag] + cssline + '\n'
+                    if cssline != ' { }':
+                        csspage += self.stags[tag] + cssline + '\n'
  
                  
          return csspage, classlst
diff --git a/Topaz_Tools/lib/topaz-changes.txt b/Topaz_Tools/lib/topaz-changes.txt

index 055b4b5735986e3f4aae0ff6c9ce30c1b691191c..f493d45a56c657aa6bf0f07b22e6c00975447d79 100644 (file)
--- a/Topaz_Tools/lib/topaz-changes.txt
+++ b/Topaz_Tools/lib/topaz-changes.txt
@@ -1,4 +1,14 @@
-Canges in 2.3
+Changes in 2.6
+       - fix for many additional version tags
+       - fixes to generate better links
+       - fixes to handle external links
+       - now handles new "marker" page .dat files
+       - improved special region handling
+       - properly handle class names with spaces
+       - handle default alignment for synthetic regions
+
+
+Changes in 2.3
         - fix for use with non-latin1 based systems (thank you Tedd)
         - fixes for out of order tokens in xml
  
diff --git a/eReader_Tools/lib/erdr2pml.py b/eReader_Tools/lib/erdr2pml.py

index 4067ff65530b3f8cfc2b4e8113931ad7933540cf..089d0009befc1e2dabe67b938a6896e7d7e85750 100644 (file)
--- a/eReader_Tools/lib/erdr2pml.py
+++ b/eReader_Tools/lib/erdr2pml.py
@@ -53,8 +53,9 @@
  #  0.12 - Fix added to prevent lowercasing of image names when the pml code itself uses a different case in the link name.
  #  0.13 - change to unbuffered stdout for use with gui front ends
  #  0.14 - contributed enhancement to support --make-pmlz switch
+#  0.15 - enabled high-ascii to pml character encoding. DropBook now works on Mac.
  
-__version__='0.14'
+__version__='0.15'
  
  # Import Psyco if available
  try:
@@ -465,17 +466,6 @@ class EreaderProcessor(object):
          data = sect[62:]
          return sanitizeFileName(name), data
  
-    def cleanPML(self,pml):
-        # Update old \b font tag with correct \B bold font tag
-        pml2 = pml.replace('\\b', '\\B')
-        # Convert special characters to proper PML code.  High ASCII start at (\x82, \a130) and go up to (\xff, \a255)
-        for k in xrange(130,256):
-            # a2b_hex takes in a hexidecimal as a string and converts it 
-            # to a binary ascii code that we search and replace for
-            badChar=binascii.a2b_hex('%02x' % k)
-            pml2 = pml2.replace(badChar, '\\a%03d' % k)
-            #end for k
-        return pml2
  
      # def getChapterNamePMLOffsetData(self):
      #     cv = ''
@@ -564,6 +554,14 @@ class EreaderProcessor(object):
  
          return r
  
+def cleanPML(pml):
+       # Convert special characters to proper PML code.  High ASCII start at (\x80, \a128) and go up to (\xff, \a255)
+       pml2 = pml
+       for k in xrange(128,256):
+               badChar = chr(k)
+               pml2 = pml2.replace(badChar, '\\a%03d' % k)
+       return pml2
+
  def convertEreaderToPml(infile, name, cc, outdir):
      if not os.path.exists(outdir):
          os.makedirs(outdir)
@@ -585,7 +583,7 @@ def convertEreaderToPml(infile, name, cc, outdir):
      print "   Extracting pml"
      pml_string = er.getText()
      pmlfilename = bookname + ".pml"
-    file(os.path.join(outdir, pmlfilename),'wb').write(pml_string)
+    file(os.path.join(outdir, pmlfilename),'wb').write(cleanPML(pml_string))
  
      # bkinfo = er.getBookInfo()
      # if bkinfo != '':
@@ -677,7 +675,7 @@ def main(argv=None):
              search_time = end_time - start_time
              print 'elapsed time: %.2f seconds' % (search_time, ) 
              if make_pmlz :
-                print 'output in %s' % zipname
+                print 'output is %s' % zipname
              else :
                  print 'output in %s' % outdir 
              print "done"
author	Apprentice Alf <apprenticealf@gmail.com>
	Tue, 2 Mar 2010 12:46:56 +0000 (12:46 +0000)
committer	Apprentice Alf <apprenticealf@gmail.com>
	Mon, 2 Mar 2015 07:43:31 +0000 (07:43 +0000)
Topaz_Tools/lib/cmbtc_dump.py		patch | blob | blame | history
Topaz_Tools/lib/cmbtc_dump_nonK4PC.py		patch | blob | blame | history
Topaz_Tools/lib/convert2xml.py		patch | blob | blame | history
Topaz_Tools/lib/decode_meta.py		patch | blob | blame | history
Topaz_Tools/lib/flatxml2html.py		patch | blob | blame | history
Topaz_Tools/lib/genhtml.py		patch | blob | blame | history
Topaz_Tools/lib/gensvg.py		patch | blob | blame | history
Topaz_Tools/lib/genxml.py		patch | blob | blame | history
Topaz_Tools/lib/getpagedim.py		patch | blob | blame | history
Topaz_Tools/lib/stylexml2css.py		patch | blob | blame | history
Topaz_Tools/lib/topaz-changes.txt		patch | blob | blame | history
eReader_Tools/lib/erdr2pml.py		patch | blob | blame | history