More fixes for Amazon books, fixing identity checks, started on Topaz.

author Apprentice Harper <apprenticeharper@gmail.com>
Fri, 16 Oct 2020 12:58:59 +0000 (13:58 +0100)
committer Apprentice Harper <apprenticeharper@gmail.com>
Fri, 16 Oct 2020 12:58:59 +0000 (13:58 +0100)
diff --git a/DeDRM_plugin/convert2xml.py b/DeDRM_plugin/convert2xml.py
index 3249db5dfe0780b7f19a8e213f2cd1a2f6f47888..abdaeb32b01067af84a7ae189a0e95a862256a8f 100644
--- a/DeDRM_plugin/convert2xml.py
+++ b/DeDRM_plugin/convert2xml.py
@@ -56,7 +56,7 @@ def readEncodedNumber(file):
              c = file.read(1)
              if (len(c) == 0):
                  return None
-            data = ord(c)
+            data = c[0]
              datax = (datax <<7) + (data & 0x7F)
          data = datax
  
@@ -188,232 +188,232 @@ class PageParser(object):
      # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
  
      token_tags = {
-        'x'            : (1, 'scalar_number', 0, 0),
-        'y'            : (1, 'scalar_number', 0, 0),
-        'h'            : (1, 'scalar_number', 0, 0),
-        'w'            : (1, 'scalar_number', 0, 0),
-        'firstWord'    : (1, 'scalar_number', 0, 0),
-        'lastWord'     : (1, 'scalar_number', 0, 0),
-        'rootID'       : (1, 'scalar_number', 0, 0),
-        'stemID'       : (1, 'scalar_number', 0, 0),
-        'type'         : (1, 'scalar_text', 0, 0),
-
-        'info'            : (0, 'number', 1, 0),
-
-        'info.word'            : (0, 'number', 1, 1),
-        'info.word.ocrText'    : (1, 'text', 0, 0),
-        'info.word.firstGlyph' : (1, 'raw', 0, 0),
-        'info.word.lastGlyph'  : (1, 'raw', 0, 0),
-        'info.word.bl'         : (1, 'raw', 0, 0),
-        'info.word.link_id'    : (1, 'number', 0, 0),
-
-        'glyph'           : (0, 'number', 1, 1),
-        'glyph.x'         : (1, 'number', 0, 0),
-        'glyph.y'         : (1, 'number', 0, 0),
-        'glyph.glyphID'   : (1, 'number', 0, 0),
-
-        'dehyphen'          : (0, 'number', 1, 1),
-        'dehyphen.rootID'   : (1, 'number', 0, 0),
-        'dehyphen.stemID'   : (1, 'number', 0, 0),
-        'dehyphen.stemPage' : (1, 'number', 0, 0),
-        'dehyphen.sh'       : (1, 'number', 0, 0),
-
-        'links'        : (0, 'number', 1, 1),
-        'links.page'   : (1, 'number', 0, 0),
-        'links.rel'    : (1, 'number', 0, 0),
-        'links.row'    : (1, 'number', 0, 0),
-        'links.title'  : (1, 'text', 0, 0),
-        'links.href'   : (1, 'text', 0, 0),
-        'links.type'   : (1, 'text', 0, 0),
-        'links.id'     : (1, 'number', 0, 0),
-
-        'paraCont'          : (0, 'number', 1, 1),
-        'paraCont.rootID'   : (1, 'number', 0, 0),
-        'paraCont.stemID'   : (1, 'number', 0, 0),
-        'paraCont.stemPage' : (1, 'number', 0, 0),
-
-        'paraStems'        : (0, 'number', 1, 1),
-        'paraStems.stemID' : (1, 'number', 0, 0),
-
-        'wordStems'          : (0, 'number', 1, 1),
-        'wordStems.stemID'   : (1, 'number', 0, 0),
-
-        'empty'          : (1, 'snippets', 1, 0),
-
-        'page'           : (1, 'snippets', 1, 0),
-        'page.class'     : (1, 'scalar_text', 0, 0),
-        'page.pageid'    : (1, 'scalar_text', 0, 0),
-        'page.pagelabel' : (1, 'scalar_text', 0, 0),
-        'page.type'      : (1, 'scalar_text', 0, 0),
-        'page.h'         : (1, 'scalar_number', 0, 0),
-        'page.w'         : (1, 'scalar_number', 0, 0),
-        'page.startID' : (1, 'scalar_number', 0, 0),
-
-        'group'           : (1, 'snippets', 1, 0),
-        'group.class'     : (1, 'scalar_text', 0, 0),
-        'group.type'      : (1, 'scalar_text', 0, 0),
-        'group._tag'      : (1, 'scalar_text', 0, 0),
-        'group.orientation': (1, 'scalar_text', 0, 0),
-
-        'region'           : (1, 'snippets', 1, 0),
-        'region.class'     : (1, 'scalar_text', 0, 0),
-        'region.type'      : (1, 'scalar_text', 0, 0),
-        'region.x'         : (1, 'scalar_number', 0, 0),
-        'region.y'         : (1, 'scalar_number', 0, 0),
-        'region.h'         : (1, 'scalar_number', 0, 0),
-        'region.w'         : (1, 'scalar_number', 0, 0),
-        'region.orientation' : (1, 'scalar_text', 0, 0),
-
-        'empty_text_region' : (1, 'snippets', 1, 0),
-
-        'img'                   : (1, 'snippets', 1, 0),
-        'img.x'                 : (1, 'scalar_number', 0, 0),
-        'img.y'                 : (1, 'scalar_number', 0, 0),
-        'img.h'                 : (1, 'scalar_number', 0, 0),
-        'img.w'                 : (1, 'scalar_number', 0, 0),
-        'img.src'               : (1, 'scalar_number', 0, 0),
-        'img.color_src'         : (1, 'scalar_number', 0, 0),
-        'img.gridSize'          : (1, 'scalar_number', 0, 0),
-        'img.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
-        'img.gridTopCenter'     : (1, 'scalar_number', 0, 0),
-        'img.gridBeginCenter'   : (1, 'scalar_number', 0, 0),
-        'img.gridEndCenter'     : (1, 'scalar_number', 0, 0),
-        'img.image_type'        : (1, 'scalar_number', 0, 0),
-
-        'paragraph'           : (1, 'snippets', 1, 0),
-        'paragraph.class'     : (1, 'scalar_text', 0, 0),
-        'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
-        'paragraph.lastWord'  : (1, 'scalar_number', 0, 0),
-        'paragraph.lastWord'  : (1, 'scalar_number', 0, 0),
-        'paragraph.gridSize'  : (1, 'scalar_number', 0, 0),
-        'paragraph.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
-        'paragraph.gridTopCenter'     : (1, 'scalar_number', 0, 0),
-        'paragraph.gridBeginCenter'   : (1, 'scalar_number', 0, 0),
-        'paragraph.gridEndCenter'     : (1, 'scalar_number', 0, 0),
-
-
-        'word_semantic'           : (1, 'snippets', 1, 1),
-        'word_semantic.type'      : (1, 'scalar_text', 0, 0),
-        'word_semantic.class'     : (1, 'scalar_text', 0, 0),
-        'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
-        'word_semantic.lastWord'  : (1, 'scalar_number', 0, 0),
-        'word_semantic.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
-        'word_semantic.gridTopCenter'     : (1, 'scalar_number', 0, 0),
-        'word_semantic.gridBeginCenter'   : (1, 'scalar_number', 0, 0),
-        'word_semantic.gridEndCenter'     : (1, 'scalar_number', 0, 0),
-
-        'word'            : (1, 'snippets', 1, 0),
-        'word.type'       : (1, 'scalar_text', 0, 0),
-        'word.class'      : (1, 'scalar_text', 0, 0),
-        'word.firstGlyph' : (1, 'scalar_number', 0, 0),
-        'word.lastGlyph'  : (1, 'scalar_number', 0, 0),
-
-        '_span'           : (1, 'snippets', 1, 0),
-        '_span.class'     : (1, 'scalar_text', 0, 0),
-        '_span.firstWord' : (1, 'scalar_number', 0, 0),
-        '_span.lastWord'  : (1, 'scalar_number', 0, 0),
-        '_span.gridSize'  : (1, 'scalar_number', 0, 0),
-        '_span.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
-        '_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
-        '_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
-        '_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
-
-        'span'           : (1, 'snippets', 1, 0),
-        'span.firstWord' : (1, 'scalar_number', 0, 0),
-        'span.lastWord'  : (1, 'scalar_number', 0, 0),
-        'span.gridSize'  : (1, 'scalar_number', 0, 0),
-        'span.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
-        'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
-        'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
-        'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
-
-        'extratokens'                   : (1, 'snippets', 1, 0),
-        'extratokens.class'             : (1, 'scalar_text', 0, 0),
-        'extratokens.type'              : (1, 'scalar_text', 0, 0),
-        'extratokens.firstGlyph'        : (1, 'scalar_number', 0, 0),
-        'extratokens.lastGlyph'         : (1, 'scalar_number', 0, 0),
-        'extratokens.gridSize'          : (1, 'scalar_number', 0, 0),
-        'extratokens.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
-        'extratokens.gridTopCenter'     : (1, 'scalar_number', 0, 0),
-        'extratokens.gridBeginCenter'   : (1, 'scalar_number', 0, 0),
-        'extratokens.gridEndCenter'     : (1, 'scalar_number', 0, 0),
-
-        'glyph.h'      : (1, 'number', 0, 0),
-        'glyph.w'      : (1, 'number', 0, 0),
-        'glyph.use'    : (1, 'number', 0, 0),
-        'glyph.vtx'    : (1, 'number', 0, 1),
-        'glyph.len'    : (1, 'number', 0, 1),
-        'glyph.dpi'    : (1, 'number', 0, 0),
-        'vtx'          : (0, 'number', 1, 1),
-        'vtx.x'        : (1, 'number', 0, 0),
-        'vtx.y'        : (1, 'number', 0, 0),
-        'len'          : (0, 'number', 1, 1),
-        'len.n'        : (1, 'number', 0, 0),
-
-        'book'         : (1, 'snippets', 1, 0),
-        'version'      : (1, 'snippets', 1, 0),
-        'version.FlowEdit_1_id'            : (1, 'scalar_text', 0, 0),
-        'version.FlowEdit_1_version'       : (1, 'scalar_text', 0, 0),
-        'version.Schema_id'                : (1, 'scalar_text', 0, 0),
-        'version.Schema_version'           : (1, 'scalar_text', 0, 0),
-        'version.Topaz_version'            : (1, 'scalar_text', 0, 0),
-        'version.WordDetailEdit_1_id'      : (1, 'scalar_text', 0, 0),
-        'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
-        'version.ZoneEdit_1_id'            : (1, 'scalar_text', 0, 0),
-        'version.ZoneEdit_1_version'       : (1, 'scalar_text', 0, 0),
-        'version.chapterheaders'           : (1, 'scalar_text', 0, 0),
-        'version.creation_date'            : (1, 'scalar_text', 0, 0),
-        'version.header_footer'            : (1, 'scalar_text', 0, 0),
-        'version.init_from_ocr'            : (1, 'scalar_text', 0, 0),
-        'version.letter_insertion'         : (1, 'scalar_text', 0, 0),
-        'version.xmlinj_convert'           : (1, 'scalar_text', 0, 0),
-        'version.xmlinj_reflow'            : (1, 'scalar_text', 0, 0),
-        'version.xmlinj_transform'         : (1, 'scalar_text', 0, 0),
-        'version.findlists'                : (1, 'scalar_text', 0, 0),
-        'version.page_num'                 : (1, 'scalar_text', 0, 0),
-        'version.page_type'                : (1, 'scalar_text', 0, 0),
-        'version.bad_text'                 : (1, 'scalar_text', 0, 0),
-        'version.glyph_mismatch'           : (1, 'scalar_text', 0, 0),
-        'version.margins'                  : (1, 'scalar_text', 0, 0),
-        'version.staggered_lines'          : (1, 'scalar_text', 0, 0),
-        'version.paragraph_continuation'   : (1, 'scalar_text', 0, 0),
-        'version.toc'                      : (1, 'scalar_text', 0, 0),
-
-        'stylesheet'                : (1, 'snippets', 1, 0),
-        'style'                     : (1, 'snippets', 1, 0),
-        'style._tag'                : (1, 'scalar_text', 0, 0),
-        'style.type'                : (1, 'scalar_text', 0, 0),
-        'style._after_type'         : (1, 'scalar_text', 0, 0),
-        'style._parent_type'        : (1, 'scalar_text', 0, 0),
-        'style._after_parent_type'  : (1, 'scalar_text', 0, 0),
-        'style.class'               : (1, 'scalar_text', 0, 0),
-        'style._after_class'        : (1, 'scalar_text', 0, 0),
-        'rule'                      : (1, 'snippets', 1, 0),
-        'rule.attr'                 : (1, 'scalar_text', 0, 0),
-        'rule.value'                : (1, 'scalar_text', 0, 0),
-
-        'original'      : (0, 'number', 1, 1),
-        'original.pnum' : (1, 'number', 0, 0),
-        'original.pid'  : (1, 'text', 0, 0),
-        'pages'        : (0, 'number', 1, 1),
-        'pages.ref'    : (1, 'number', 0, 0),
-        'pages.id'     : (1, 'number', 0, 0),
-        'startID'      : (0, 'number', 1, 1),
-        'startID.page' : (1, 'number', 0, 0),
-        'startID.id'   : (1, 'number', 0, 0),
-
-        'median_d'          : (1, 'number', 0, 0),
-        'median_h'          : (1, 'number', 0, 0),
-        'median_firsty'     : (1, 'number', 0, 0),
-        'median_lasty'      : (1, 'number', 0, 0),
-
-        'num_footers_maybe' : (1, 'number', 0, 0),
-        'num_footers_yes'   : (1, 'number', 0, 0),
-        'num_headers_maybe' : (1, 'number', 0, 0),
-        'num_headers_yes'   : (1, 'number', 0, 0),
-
-        'tracking'          : (1, 'number', 0, 0),
-        'src'               : (1, 'text', 0, 0),
+        b'x'            : (1, 'scalar_number', 0, 0),
+        b'y'            : (1, 'scalar_number', 0, 0),
+        b'h'            : (1, 'scalar_number', 0, 0),
+        b'w'            : (1, 'scalar_number', 0, 0),
+        b'firstWord'    : (1, 'scalar_number', 0, 0),
+        b'lastWord'     : (1, 'scalar_number', 0, 0),
+        b'rootID'       : (1, 'scalar_number', 0, 0),
+        b'stemID'       : (1, 'scalar_number', 0, 0),
+        b'type'         : (1, 'scalar_text', 0, 0),
+
+        b'info'            : (0, 'number', 1, 0),
+
+        b'info.word'            : (0, 'number', 1, 1),
+        b'info.word.ocrText'    : (1, 'text', 0, 0),
+        b'info.word.firstGlyph' : (1, 'raw', 0, 0),
+        b'info.word.lastGlyph'  : (1, 'raw', 0, 0),
+        b'info.word.bl'         : (1, 'raw', 0, 0),
+        b'info.word.link_id'    : (1, 'number', 0, 0),
+
+        b'glyph'           : (0, 'number', 1, 1),
+        b'glyph.x'         : (1, 'number', 0, 0),
+        b'glyph.y'         : (1, 'number', 0, 0),
+        b'glyph.glyphID'   : (1, 'number', 0, 0),
+
+        b'dehyphen'          : (0, 'number', 1, 1),
+        b'dehyphen.rootID'   : (1, 'number', 0, 0),
+        b'dehyphen.stemID'   : (1, 'number', 0, 0),
+        b'dehyphen.stemPage' : (1, 'number', 0, 0),
+        b'dehyphen.sh'       : (1, 'number', 0, 0),
+
+        b'links'        : (0, 'number', 1, 1),
+        b'links.page'   : (1, 'number', 0, 0),
+        b'links.rel'    : (1, 'number', 0, 0),
+        b'links.row'    : (1, 'number', 0, 0),
+        b'links.title'  : (1, 'text', 0, 0),
+        b'links.href'   : (1, 'text', 0, 0),
+        b'links.type'   : (1, 'text', 0, 0),
+        b'links.id'     : (1, 'number', 0, 0),
+
+        b'paraCont'          : (0, 'number', 1, 1),
+        b'paraCont.rootID'   : (1, 'number', 0, 0),
+        b'paraCont.stemID'   : (1, 'number', 0, 0),
+        b'paraCont.stemPage' : (1, 'number', 0, 0),
+
+        b'paraStems'        : (0, 'number', 1, 1),
+        b'paraStems.stemID' : (1, 'number', 0, 0),
+
+        b'wordStems'          : (0, 'number', 1, 1),
+        b'wordStems.stemID'   : (1, 'number', 0, 0),
+
+        b'empty'          : (1, 'snippets', 1, 0),
+
+        b'page'           : (1, 'snippets', 1, 0),
+        b'page.class'     : (1, 'scalar_text', 0, 0),
+        b'page.pageid'    : (1, 'scalar_text', 0, 0),
+        b'page.pagelabel' : (1, 'scalar_text', 0, 0),
+        b'page.type'      : (1, 'scalar_text', 0, 0),
+        b'page.h'         : (1, 'scalar_number', 0, 0),
+        b'page.w'         : (1, 'scalar_number', 0, 0),
+        b'page.startID' : (1, 'scalar_number', 0, 0),
+
+        b'group'           : (1, 'snippets', 1, 0),
+        b'group.class'     : (1, 'scalar_text', 0, 0),
+        b'group.type'      : (1, 'scalar_text', 0, 0),
+        b'group._tag'      : (1, 'scalar_text', 0, 0),
+        b'group.orientation': (1, 'scalar_text', 0, 0),
+
+        b'region'           : (1, 'snippets', 1, 0),
+        b'region.class'     : (1, 'scalar_text', 0, 0),
+        b'region.type'      : (1, 'scalar_text', 0, 0),
+        b'region.x'         : (1, 'scalar_number', 0, 0),
+        b'region.y'         : (1, 'scalar_number', 0, 0),
+        b'region.h'         : (1, 'scalar_number', 0, 0),
+        b'region.w'         : (1, 'scalar_number', 0, 0),
+        b'region.orientation' : (1, 'scalar_text', 0, 0),
+
+        b'empty_text_region' : (1, 'snippets', 1, 0),
+
+        b'img'                   : (1, 'snippets', 1, 0),
+        b'img.x'                 : (1, 'scalar_number', 0, 0),
+        b'img.y'                 : (1, 'scalar_number', 0, 0),
+        b'img.h'                 : (1, 'scalar_number', 0, 0),
+        b'img.w'                 : (1, 'scalar_number', 0, 0),
+        b'img.src'               : (1, 'scalar_number', 0, 0),
+        b'img.color_src'         : (1, 'scalar_number', 0, 0),
+        b'img.gridSize'          : (1, 'scalar_number', 0, 0),
+        b'img.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
+        b'img.gridTopCenter'     : (1, 'scalar_number', 0, 0),
+        b'img.gridBeginCenter'   : (1, 'scalar_number', 0, 0),
+        b'img.gridEndCenter'     : (1, 'scalar_number', 0, 0),
+        b'img.image_type'        : (1, 'scalar_number', 0, 0),
+
+        b'paragraph'           : (1, 'snippets', 1, 0),
+        b'paragraph.class'     : (1, 'scalar_text', 0, 0),
+        b'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
+        b'paragraph.lastWord'  : (1, 'scalar_number', 0, 0),
+        b'paragraph.lastWord'  : (1, 'scalar_number', 0, 0),
+        b'paragraph.gridSize'  : (1, 'scalar_number', 0, 0),
+        b'paragraph.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
+        b'paragraph.gridTopCenter'     : (1, 'scalar_number', 0, 0),
+        b'paragraph.gridBeginCenter'   : (1, 'scalar_number', 0, 0),
+        b'paragraph.gridEndCenter'     : (1, 'scalar_number', 0, 0),
+
+
+        b'word_semantic'           : (1, 'snippets', 1, 1),
+        b'word_semantic.type'      : (1, 'scalar_text', 0, 0),
+        b'word_semantic.class'     : (1, 'scalar_text', 0, 0),
+        b'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
+        b'word_semantic.lastWord'  : (1, 'scalar_number', 0, 0),
+        b'word_semantic.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
+        b'word_semantic.gridTopCenter'     : (1, 'scalar_number', 0, 0),
+        b'word_semantic.gridBeginCenter'   : (1, 'scalar_number', 0, 0),
+        b'word_semantic.gridEndCenter'     : (1, 'scalar_number', 0, 0),
+
+        b'word'            : (1, 'snippets', 1, 0),
+        b'word.type'       : (1, 'scalar_text', 0, 0),
+        b'word.class'      : (1, 'scalar_text', 0, 0),
+        b'word.firstGlyph' : (1, 'scalar_number', 0, 0),
+        b'word.lastGlyph'  : (1, 'scalar_number', 0, 0),
+
+        b'_span'           : (1, 'snippets', 1, 0),
+        b'_span.class'     : (1, 'scalar_text', 0, 0),
+        b'_span.firstWord' : (1, 'scalar_number', 0, 0),
+        b'_span.lastWord'  : (1, 'scalar_number', 0, 0),
+        b'_span.gridSize'  : (1, 'scalar_number', 0, 0),
+        b'_span.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
+        b'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
+        b'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+        b'_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
+
+        b'span'           : (1, 'snippets', 1, 0),
+        b'span.firstWord' : (1, 'scalar_number', 0, 0),
+        b'span.lastWord'  : (1, 'scalar_number', 0, 0),
+        b'span.gridSize'  : (1, 'scalar_number', 0, 0),
+        b'span.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
+        b'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
+        b'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
+        b'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
+
+        b'extratokens'                   : (1, 'snippets', 1, 0),
+        b'extratokens.class'             : (1, 'scalar_text', 0, 0),
+        b'extratokens.type'              : (1, 'scalar_text', 0, 0),
+        b'extratokens.firstGlyph'        : (1, 'scalar_number', 0, 0),
+        b'extratokens.lastGlyph'         : (1, 'scalar_number', 0, 0),
+        b'extratokens.gridSize'          : (1, 'scalar_number', 0, 0),
+        b'extratokens.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
+        b'extratokens.gridTopCenter'     : (1, 'scalar_number', 0, 0),
+        b'extratokens.gridBeginCenter'   : (1, 'scalar_number', 0, 0),
+        b'extratokens.gridEndCenter'     : (1, 'scalar_number', 0, 0),
+
+        b'glyph.h'      : (1, 'number', 0, 0),
+        b'glyph.w'      : (1, 'number', 0, 0),
+        b'glyph.use'    : (1, 'number', 0, 0),
+        b'glyph.vtx'    : (1, 'number', 0, 1),
+        b'glyph.len'    : (1, 'number', 0, 1),
+        b'glyph.dpi'    : (1, 'number', 0, 0),
+        b'vtx'          : (0, 'number', 1, 1),
+        b'vtx.x'        : (1, 'number', 0, 0),
+        b'vtx.y'        : (1, 'number', 0, 0),
+        b'len'          : (0, 'number', 1, 1),
+        b'len.n'        : (1, 'number', 0, 0),
+
+        b'book'         : (1, 'snippets', 1, 0),
+        b'version'      : (1, 'snippets', 1, 0),
+        b'version.FlowEdit_1_id'            : (1, 'scalar_text', 0, 0),
+        b'version.FlowEdit_1_version'       : (1, 'scalar_text', 0, 0),
+        b'version.Schema_id'                : (1, 'scalar_text', 0, 0),
+        b'version.Schema_version'           : (1, 'scalar_text', 0, 0),
+        b'version.Topaz_version'            : (1, 'scalar_text', 0, 0),
+        b'version.WordDetailEdit_1_id'      : (1, 'scalar_text', 0, 0),
+        b'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
+        b'version.ZoneEdit_1_id'            : (1, 'scalar_text', 0, 0),
+        b'version.ZoneEdit_1_version'       : (1, 'scalar_text', 0, 0),
+        b'version.chapterheaders'           : (1, 'scalar_text', 0, 0),
+        b'version.creation_date'            : (1, 'scalar_text', 0, 0),
+        b'version.header_footer'            : (1, 'scalar_text', 0, 0),
+        b'version.init_from_ocr'            : (1, 'scalar_text', 0, 0),
+        b'version.letter_insertion'         : (1, 'scalar_text', 0, 0),
+        b'version.xmlinj_convert'           : (1, 'scalar_text', 0, 0),
+        b'version.xmlinj_reflow'            : (1, 'scalar_text', 0, 0),
+        b'version.xmlinj_transform'         : (1, 'scalar_text', 0, 0),
+        b'version.findlists'                : (1, 'scalar_text', 0, 0),
+        b'version.page_num'                 : (1, 'scalar_text', 0, 0),
+        b'version.page_type'                : (1, 'scalar_text', 0, 0),
+        b'version.bad_text'                 : (1, 'scalar_text', 0, 0),
+        b'version.glyph_mismatch'           : (1, 'scalar_text', 0, 0),
+        b'version.margins'                  : (1, 'scalar_text', 0, 0),
+        b'version.staggered_lines'          : (1, 'scalar_text', 0, 0),
+        b'version.paragraph_continuation'   : (1, 'scalar_text', 0, 0),
+        b'version.toc'                      : (1, 'scalar_text', 0, 0),
+
+        b'stylesheet'                : (1, 'snippets', 1, 0),
+        b'style'                     : (1, 'snippets', 1, 0),
+        b'style._tag'                : (1, 'scalar_text', 0, 0),
+        b'style.type'                : (1, 'scalar_text', 0, 0),
+        b'style._after_type'         : (1, 'scalar_text', 0, 0),
+        b'style._parent_type'        : (1, 'scalar_text', 0, 0),
+        b'style._after_parent_type'  : (1, 'scalar_text', 0, 0),
+        b'style.class'               : (1, 'scalar_text', 0, 0),
+        b'style._after_class'        : (1, 'scalar_text', 0, 0),
+        b'rule'                      : (1, 'snippets', 1, 0),
+        b'rule.attr'                 : (1, 'scalar_text', 0, 0),
+        b'rule.value'                : (1, 'scalar_text', 0, 0),
+
+        b'original'      : (0, 'number', 1, 1),
+        b'original.pnum' : (1, 'number', 0, 0),
+        b'original.pid'  : (1, 'text', 0, 0),
+        b'pages'        : (0, 'number', 1, 1),
+        b'pages.ref'    : (1, 'number', 0, 0),
+        b'pages.id'     : (1, 'number', 0, 0),
+        b'startID'      : (0, 'number', 1, 1),
+        b'startID.page' : (1, 'number', 0, 0),
+        b'startID.id'   : (1, 'number', 0, 0),
+
+        b'median_d'          : (1, 'number', 0, 0),
+        b'median_h'          : (1, 'number', 0, 0),
+        b'median_firsty'     : (1, 'number', 0, 0),
+        b'median_lasty'      : (1, 'number', 0, 0),
+
+        b'num_footers_maybe' : (1, 'number', 0, 0),
+        b'num_footers_yes'   : (1, 'number', 0, 0),
+        b'num_headers_maybe' : (1, 'number', 0, 0),
+        b'num_headers_yes'   : (1, 'number', 0, 0),
+
+        b'tracking'          : (1, 'number', 0, 0),
+        b'src'               : (1, 'text', 0, 0),
  
       }
  
@@ -430,7 +430,7 @@ class PageParser(object):
          cnt = len(self.tagpath)
          if i < cnt : result = self.tagpath[i]
          for j in range(i+1, cnt) :
-            result += '.' + self.tagpath[j]
+            result += b'.' + self.tagpath[j]
          return result
  
  
@@ -505,7 +505,7 @@ class PageParser(object):
  
              if (subtags == 1):
                  ntags = readEncodedNumber(self.fo)
-                if self.debug : print('subtags: ' + token + ' has ' + str(ntags))
+                if self.debug : print('subtags: ', token , ' has ' , str(ntags))
                  for j in range(ntags):
                      val = readEncodedNumber(self.fo)
                      subtagres.append(self.procToken(self.dict.lookup(val)))
@@ -613,7 +613,7 @@ class PageParser(object):
          subtagList = tag[1]
          argtype = tag[2]
          argList = tag[3]
-        nname = prefix + '.' + name
+        nname = prefix + b'.' + name
          nsubtaglist = []
          for j in subtagList:
              nsubtaglist.append(self.updateName(j,prefix))
@@ -662,34 +662,34 @@ class PageParser(object):
          subtagList = node[1]
          argtype = node[2]
          argList = node[3]
-        fullpathname = name.split('.')
+        fullpathname = name.split(b'.')
          nodename = fullpathname.pop()
          ilvl = len(fullpathname)
-        indent = ' ' * (3 * ilvl)
+        indent = b' ' * (3 * ilvl)
          rlst = []
-        rlst.append(indent + '<' + nodename + '>')
+        rlst.append(indent + b'<' + nodename + b'>')
          if len(argList) > 0:
              alst = []
              for j in argList:
-                if (argtype == 'text') or (argtype == 'scalar_text') :
-                    alst.append(j + '|')
+                if (argtype == b'text') or (argtype == b'scalar_text') :
+                    alst.append(j + b'|')
                  else :
-                    alst.append(str(j) + ',')
-            argres = "".join(alst)
+                    alst.append(str(j).encode('utf-8') + b',')
+            argres = b"".join(alst)
              argres = argres[0:-1]
-            if argtype == 'snippets' :
-                rlst.append('snippets:' + argres)
+            if argtype == b'snippets' :
+                rlst.append(b'snippets:' + argres)
              else :
                  rlst.append(argres)
          if len(subtagList) > 0 :
-            rlst.append('\n')
+            rlst.append(b'\n')
              for j in subtagList:
                  if len(j) > 0 :
                      rlst.append(self.formatTag(j))
-            rlst.append(indent + '</' + nodename + '>\n')
+            rlst.append(indent + b'</' + nodename + b'>\n')
          else:
-            rlst.append('</' + nodename + '>\n')
-        return "".join(rlst)
+            rlst.append(b'</' + nodename + b'>\n')
+        return b"".join(rlst)
  
  
      # flatten tag
@@ -704,20 +704,20 @@ class PageParser(object):
              alst = []
              for j in argList:
                  if (argtype == 'text') or (argtype == 'scalar_text') :
-                    alst.append(j + '|')
+                     alst.append(j + b'|')
                  else :
-                    alst.append(str(j) + '|')
-            argres = "".join(alst)
+                    alst.append(str(j).encode('utf-8') + b'|')
+            argres = b"".join(alst)
              argres = argres[0:-1]
-            if argtype == 'snippets' :
-                rlst.append('.snippets=' + argres)
+            if argtype == b'snippets' :
+                rlst.append(b'.snippets=' + argres)
              else :
-                rlst.append('=' + argres)
-        rlst.append('\n')
+                rlst.append(b'=' + argres)
+        rlst.append(b'\n')
          for j in subtagList:
              if len(j) > 0 :
                  rlst.append(self.flattenTag(j))
-        return "".join(rlst)
+        return b"".join(rlst)
  
  
      # reduce create xml output
@@ -729,7 +729,7 @@ class PageParser(object):
                      rlst.append(self.flattenTag(j))
                  else:
                      rlst.append(self.formatTag(j))
-        result = "".join(rlst)
+        result = b"".join(rlst)
          if self.debug : print(result)
          return result
  
@@ -747,16 +747,16 @@ class PageParser(object):
  
          # peek at the first bytes to see what type of file it is
          magic = self.fo.read(9)
-        if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
-            first_token = 'info'
-        elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
+        if (magic[0:1] == b'p') and (magic[2:9] == b'marker_'):
+            first_token = b'info'
+        elif (magic[0:1] == b'p') and (magic[2:9] == b'__PAGE_'):
              skip = self.fo.read(2)
-            first_token = 'info'
-        elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'):
-            first_token = 'info'
-        elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
+            first_token = b'info'
+        elif (magic[0:1] == b'p') and (magic[2:8] == b'_PAGE_'):
+            first_token = b'info'
+        elif (magic[0:1] == b'g') and (magic[2:9] == b'__GLYPH'):
              skip = self.fo.read(3)
-            first_token = 'info'
+            first_token = b'info'
          else :
              # other0.dat file
              first_token = None
@@ -778,7 +778,7 @@ class PageParser(object):
                  break
  
              if (v == 0x72):
-                self.doLoop72('number')
+                self.doLoop72(b'number')
              elif (v > 0) and (v < self.dict.getSize()) :
                  tag = self.procToken(self.dict.lookup(v))
                  if len(tag) > 0 :
@@ -789,7 +789,7 @@ class PageParser(object):
                  if (v == 0):
                      if (self.peek(1) == 0x5f):
                          skip = self.fo.read(1)
-                        first_token = 'info'
+                        first_token = b'info'
  
          # now do snippet injection
          if len(self.snippetList) > 0 :
@@ -809,14 +809,14 @@ class PageParser(object):
  
  def fromData(dict, fname):
      flat_xml = True
-    debug = False
+    debug = True
      pp = PageParser(fname, dict, debug, flat_xml)
      xmlpage = pp.process()
      return xmlpage
  
  def getXML(dict, fname):
      flat_xml = False
-    debug = False
+    debug = True
      pp = PageParser(fname, dict, debug, flat_xml)
      xmlpage = pp.process()
      return xmlpage
@@ -845,7 +845,7 @@ def main(argv):
      sys.stderr=SafeUnbuffered(sys.stderr)
      dictFile = ""
      pageFile = ""
-    debug = False
+    debug = True
      flat_xml = False
      printOutput = False
      if len(argv) == 0:
diff --git a/DeDRM_plugin/flatxml2html.py b/DeDRM_plugin/flatxml2html.py
index 6f839ce29d885faedbda8ef454720ce80d1cbffe..f1ca81dc73c489feb8aee870bdc17852044df12e 100644
--- a/DeDRM_plugin/flatxml2html.py
+++ b/DeDRM_plugin/flatxml2html.py
@@ -7,6 +7,7 @@ import csv
  import os
  import math
  import getopt
+import functools
  from struct import pack
  from struct import unpack
  
@@ -15,14 +16,14 @@ class DocParser(object):
      def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage):
          self.id = os.path.basename(fileid).replace('.dat','')
          self.svgcount = 0
-        self.docList = flatxml.split('\n')
+        self.docList = flatxml.split(b'\n')
          self.docSize = len(self.docList)
          self.classList = {}
          self.bookDir = bookDir
          self.gdict = gdict
          tmpList = classlst.split('\n')
          for pclass in tmpList:
-            if pclass != '':
+            if pclass != b'':
                  # remove the leading period from the css name
                  cname = pclass[1:]
              self.classList[cname] = True
@@ -57,9 +58,9 @@ class DocParser(object):
          imgfile = os.path.join(imgDir,imgname)
  
          # get glyph information
-        gxList = self.getData('info.glyph.x',0,-1)
-        gyList = self.getData('info.glyph.y',0,-1)
-        gidList = self.getData('info.glyph.glyphID',0,-1)
+        gxList = self.getData(b'info.glyph.x',0,-1)
+        gyList = self.getData(b'info.glyph.y',0,-1)
+        gidList = self.getData(b'info.glyph.glyphID',0,-1)
  
          gids = []
          maxws = []
@@ -122,11 +123,11 @@ class DocParser(object):
      def lineinDoc(self, pos) :
          if (pos >= 0) and (pos < self.docSize) :
              item = self.docList[pos]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
              else :
                  name = item
-                argres = ''
+                argres = b''
          return name, argres
  
  
@@ -140,11 +141,13 @@ class DocParser(object):
          foundat = -1
          for j in range(pos, end):
              item = self.docList[j]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
              else :
                  name = item
                  argres = ''
+            if (isinstance(tagpath,str)):
+                tagpath = tagpath.encode('utf-8')
              if name.endswith(tagpath) :
                  result = argres
                  foundat = j
@@ -170,7 +173,7 @@ class DocParser(object):
          argres=[]
          (foundat, argt) = self.findinDoc(tagpath, pos, end)
          if (argt != None) and (len(argt) > 0) :
-            argList = argt.split('|')
+            argList = argt.split(b'|')
              argres = [ int(strval) for strval in argList]
          return argres
  
@@ -191,21 +194,21 @@ class DocParser(object):
  
          # also some class names have spaces in them so need to convert to dashes
          if nclass != None :
-            nclass = nclass.replace(' ','-')
-            classres = ''
+            nclass = nclass.replace(b' ',b'-')
+            classres = b''
              nclass = nclass.lower()
-            nclass = 'cl-' + nclass
-            baseclass = ''
+            nclass = b'cl-' + nclass
+            baseclass = b''
              # graphic is the base class for captions
-            if nclass.find('cl-cap-') >=0 :
-                classres = 'graphic' + ' '
+            if nclass.find(b'cl-cap-') >=0 :
+                classres = b'graphic' + b' '
              else :
                  # strip to find baseclass
-                p = nclass.find('_')
+                p = nclass.find(b'_')
                  if p > 0 :
                      baseclass = nclass[0:p]
                      if baseclass in self.classList:
-                        classres += baseclass + ' '
+                        classres += baseclass + b' '
              classres += nclass
              nclass = classres
          return nclass
@@ -225,11 +228,11 @@ class DocParser(object):
              return -1
  
          result = []
-        (pos, pagetype) = self.findinDoc('page.type',0,-1)
+        (pos, pagetype) = self.findinDoc(b'page.type',0,-1)
  
-        groupList = self.posinDoc('page.group')
-        groupregionList = self.posinDoc('page.group.region')
-        pageregionList = self.posinDoc('page.region')
+        groupList = self.posinDoc(b'page.group')
+        groupregionList = self.posinDoc(b'page.group.region')
+        pageregionList = self.posinDoc(b'page.region')
          # integrate into one list
          for j in groupList:
              result.append(('grpbeg',j))
@@ -237,7 +240,7 @@ class DocParser(object):
              result.append(('gregion',j))
          for j in pageregionList:
              result.append(('pregion',j))
-        result.sort(compare)
+        result.sort(key=functools.cmp_to_key(compare))
  
          # insert group end and page end indicators
          inGroup = False
@@ -267,33 +270,33 @@ class DocParser(object):
          result = []
  
          # paragraph
-        (pos, pclass) = self.findinDoc('paragraph.class',start,end)
+        (pos, pclass) = self.findinDoc(b'paragraph.class',start,end)
  
          pclass = self.getClass(pclass)
  
          # if paragraph uses extratokens (extra glyphs) then make it fixed
-        (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
+        (pos, extraglyphs) = self.findinDoc(b'paragraph.extratokens',start,end)
  
          # build up a description of the paragraph in result and return it
          # first check for the  basic - all words paragraph
-        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
-        (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
+        (pos, sfirst) = self.findinDoc(b'paragraph.firstWord',start,end)
+        (pos, slast) = self.findinDoc(b'paragraph.lastWord',start,end)
          if (sfirst != None) and (slast != None) :
              first = int(sfirst)
              last = int(slast)
  
-            makeImage = (regtype == 'vertical') or (regtype == 'table')
+            makeImage = (regtype == b'vertical') or (regtype == b'table')
              makeImage = makeImage or (extraglyphs != None)
              if self.fixedimage:
-                makeImage = makeImage or (regtype == 'fixed')
+                makeImage = makeImage or (regtype == b'fixed')
  
              if (pclass != None):
-                makeImage = makeImage or (pclass.find('.inverted') >= 0)
+                makeImage = makeImage or (pclass.find(b'.inverted') >= 0)
                  if self.fixedimage :
-                    makeImage = makeImage or (pclass.find('cl-f-') >= 0)
+                    makeImage = makeImage or (pclass.find(b'cl-f-') >= 0)
  
              # before creating an image make sure glyph info exists
-            gidList = self.getData('info.glyph.glyphID',0,-1)
+            gidList = self.getData(b'info.glyph.glyphID',0,-1)
  
              makeImage = makeImage & (len(gidList) > 0)
  
@@ -307,8 +310,8 @@ class DocParser(object):
              # translate first and last word into first and last glyphs
              # and generate inline image and include it
              glyphList = []
-            firstglyphList = self.getData('word.firstGlyph',0,-1)
-            gidList = self.getData('info.glyph.glyphID',0,-1)
+            firstglyphList = self.getData(b'word.firstGlyph',0,-1)
+            gidList = self.getData(b'info.glyph.glyphID',0,-1)
              firstGlyph = firstglyphList[first]
              if last < len(firstglyphList):
                  lastGlyph = firstglyphList[last]
@@ -326,8 +329,8 @@ class DocParser(object):
              for glyphnum in range(firstGlyph, lastGlyph):
                  glyphList.append(glyphnum)
              # include any extratokens if they exist
-            (pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
-            (pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
+            (pos, sfg) = self.findinDoc(b'extratokens.firstGlyph',start,end)
+            (pos, slg) = self.findinDoc(b'extratokens.lastGlyph',start,end)
              if (sfg != None) and (slg != None):
                  for glyphnum in range(int(sfg), int(slg)):
                      glyphList.append(glyphnum)
@@ -368,39 +371,39 @@ class DocParser(object):
  
              (name, argres) = self.lineinDoc(line)
  
-            if name.endswith('span.firstWord') :
+            if name.endswith(b'span.firstWord') :
                  sp_first = int(argres)
  
-            elif name.endswith('span.lastWord') :
+            elif name.endswith(b'span.lastWord') :
                  sp_last = int(argres)
  
-            elif name.endswith('word.firstGlyph') :
+            elif name.endswith(b'word.firstGlyph') :
                  gl_first = int(argres)
  
-            elif name.endswith('word.lastGlyph') :
+            elif name.endswith(b'word.lastGlyph') :
                  gl_last = int(argres)
  
-            elif name.endswith('word_semantic.firstWord'):
+            elif name.endswith(b'word_semantic.firstWord'):
                  ws_first = int(argres)
  
-            elif name.endswith('word_semantic.lastWord'):
+            elif name.endswith(b'word_semantic.lastWord'):
                  ws_last = int(argres)
  
-            elif name.endswith('word.class'):
+            elif name.endswith(b'word.class'):
                  # we only handle spaceafter word class
                  try:
-                    (cname, space) = argres.split('-',1)
-                    if space == '' : space = '0'
-                    if (cname == 'spaceafter') and (int(space) > 0) :
+                    (cname, space) = argres.split(b'-',1)
+                    if space == b'' : space = b'0'
+                    if (cname == b'spaceafter') and (int(space) > 0) :
                          word_class = 'sa'
                  except:
                      pass
  
-            elif name.endswith('word.img.src'):
+            elif name.endswith(b'word.img.src'):
                  result.append(('img' + word_class, int(argres)))
                  word_class = ''
  
-            elif name.endswith('region.img.src'):
+            elif name.endswith(b'region.img.src'):
                  result.append(('img' + word_class, int(argres)))
  
              if (sp_first != -1) and (sp_last != -1):
@@ -437,7 +440,7 @@ class DocParser(object):
  
          classres = ''
          if pclass :
-            classres = ' class="' + pclass + '"'
+            classres = ' class="' + pclass.decode('utf-8') + '"'
  
          br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')
  
@@ -470,8 +473,8 @@ class DocParser(object):
                      if (link > 0):
                          linktype = self.link_type[link-1]
                          title = self.link_title[link-1]
-                        if (title == "") or (parares.rfind(title) < 0):
-                            title=parares[lstart:]
+                        if (title == b"") or (parares.rfind(title.decode('utf-8')) < 0):
+                            title=parares[lstart:].encode('utf-8')
                          if linktype == 'external' :
                              linkhref = self.link_href[link-1]
                              linkhtml = '<a href="%s">' % linkhref
@@ -482,33 +485,34 @@ class DocParser(object):
                              else :
                                  # just link to the current page
                                  linkhtml = '<a href="#' + self.id + '">'
-                        linkhtml += title + '</a>'
-                        pos = parares.rfind(title)
+                        linkhtml += title.decode('utf-8')
+                        linkhtml += '</a>'
+                        pos = parares.rfind(title.decode('utf-8'))
                          if pos >= 0:
                              parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
                          else :
                              parares += linkhtml
                          lstart = len(parares)
-                        if word == '_link_' : word = ''
+                        if word == b'_link_' : word = b''
                      elif (link < 0) :
-                        if word == '_link_' : word = ''
+                        if word == b'_link_' : word = b''
  
-                if word == '_lb_':
+                if word == b'_lb_':
                      if ((num-1) in self.dehyphen_rootid ) or handle_links:
-                        word = ''
+                        word = b''
                          sep = ''
                      elif br_lb :
-                        word = '<br />\n'
+                        word = b'<br />\n'
                          sep = ''
                      else :
-                        word = '\n'
+                        word = b'\n'
                          sep = ''
  
                  if num in self.dehyphen_rootid :
                      word = word[0:-1]
                      sep = ''
  
-                parares += word + sep
+                parares += word.decode('utf-8') + sep
  
              elif wtype == 'img' :
                  sep = ''
@@ -522,7 +526,9 @@ class DocParser(object):
  
              elif wtype == 'svg' :
                  sep = ''
-                parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num
+                parares += '<img src="img/'
+                parares += self.id
+                parares += '_%04d.svg" alt="" />' % num
                  parares += sep
  
          if len(sep) > 0 : parares = parares[0:-1]
@@ -545,7 +551,7 @@ class DocParser(object):
              (wtype, num) = pdesc[j]
  
              if wtype == 'ocr' :
-                word = self.ocrtext[num]
+                word = self.ocrtext[num].decode('utf-8')
                  sep = ' '
  
                  if handle_links:
@@ -553,7 +559,7 @@ class DocParser(object):
                      if (link > 0):
                          linktype = self.link_type[link-1]
                          title = self.link_title[link-1]
-                        title = title.rstrip('. ')
+                        title = title.rstrip(b'. ')
                          alt_title = parares[lstart:]
                          alt_title = alt_title.strip()
                          # now strip off the actual printed page number
@@ -607,38 +613,38 @@ class DocParser(object):
          hlst = []
  
          # get the ocr text
-        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
-        if argres :  self.ocrtext = argres.split('|')
+        (pos, argres) = self.findinDoc(b'info.word.ocrText',0,-1)
+        if argres :  self.ocrtext = argres.split(b'|')
  
          # get information to dehyphenate the text
-        self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)
+        self.dehyphen_rootid = self.getData(b'info.dehyphen.rootID',0,-1)
  
          # determine if first paragraph is continued from previous page
-        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
+        (pos, self.parastems_stemid) = self.findinDoc(b'info.paraStems.stemID',0,-1)
          first_para_continued = (self.parastems_stemid  != None)
  
          # determine if last paragraph is continued onto the next page
-        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
+        (pos, self.paracont_stemid) = self.findinDoc(b'info.paraCont.stemID',0,-1)
          last_para_continued = (self.paracont_stemid != None)
  
          # collect link ids
-        self.link_id = self.getData('info.word.link_id',0,-1)
+        self.link_id = self.getData(b'info.word.link_id',0,-1)
  
          # collect link destination page numbers
-        self.link_page = self.getData('info.links.page',0,-1)
+        self.link_page = self.getData(b'info.links.page',0,-1)
  
          # collect link types (container versus external)
-        (pos, argres) = self.findinDoc('info.links.type',0,-1)
-        if argres :  self.link_type = argres.split('|')
+        (pos, argres) = self.findinDoc(b'info.links.type',0,-1)
+        if argres :  self.link_type = argres.split(b'|')
  
          # collect link destinations
-        (pos, argres) = self.findinDoc('info.links.href',0,-1)
-        if argres :  self.link_href = argres.split('|')
+        (pos, argres) = self.findinDoc(b'info.links.href',0,-1)
+        if argres :  self.link_href = argres.split(b'|')
  
          # collect link titles
-        (pos, argres) = self.findinDoc('info.links.title',0,-1)
+        (pos, argres) = self.findinDoc(b'info.links.title',0,-1)
          if argres :
-            self.link_title = argres.split('|')
+            self.link_title = argres.split(b'|')
          else:
              self.link_title.append('')
  
@@ -662,51 +668,51 @@ class DocParser(object):
              # set anchor for link target on this page
              if not anchorSet and not first_para_continued:
                  hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="')
-                hlst.append(self.id + '" title="pagetype_' + pagetype + '"></div>\n')
+                hlst.append(self.id + '" title="pagetype_' + pagetype.decode('utf-8') + '"></div>\n')
                  anchorSet = True
  
              # handle groups of graphics with text captions
-            if (etype == 'grpbeg'):
-                (pos, grptype) = self.findinDoc('group.type', start, end)
+            if (etype == b'grpbeg'):
+                (pos, grptype) = self.findinDoc(b'group.type', start, end)
                  if grptype != None:
-                    if grptype == 'graphic':
-                        gcstr = ' class="' + grptype + '"'
+                    if grptype == b'graphic':
+                        gcstr = ' class="' + grptype.decode('utf-8') + '"'
                          hlst.append('<div' + gcstr + '>')
                          inGroup = True
  
-            elif (etype == 'grpend'):
+            elif (etype == b'grpend'):
                  if inGroup:
                      hlst.append('</div>\n')
                      inGroup = False
  
              else:
-                (pos, regtype) = self.findinDoc('region.type',start,end)
+                (pos, regtype) = self.findinDoc(b'region.type',start,end)
  
-                if regtype == 'graphic' :
-                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
+                if regtype == b'graphic' :
+                    (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                      if simgsrc:
                          if inGroup:
                              hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc))
                          else:
                              hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
  
-                elif regtype == 'chapterheading' :
+                elif regtype == b'chapterheading' :
                      (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                      if not breakSet:
                          hlst.append('<div style="page-break-after: always;">&nbsp;</div>\n')
                          breakSet = True
                      tag = 'h1'
                      if pclass and (len(pclass) >= 7):
-                        if pclass[3:7] == 'ch1-' : tag = 'h1'
-                        if pclass[3:7] == 'ch2-' : tag = 'h2'
-                        if pclass[3:7] == 'ch3-' : tag = 'h3'
-                        hlst.append('<' + tag + ' class="' + pclass + '">')
+                        if pclass[3:7] == b'ch1-' : tag = 'h1'
+                        if pclass[3:7] == b'ch2-' : tag = 'h2'
+                        if pclass[3:7] == b'ch3-' : tag = 'h3'
+                        hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                      else:
                          hlst.append('<' + tag + '>')
                      hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                      hlst.append('</' + tag + '>')
  
-                elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
+                elif (regtype == b'text') or (regtype == b'fixed') or (regtype == b'insert') or (regtype == b'listitem'):
                      ptype = 'full'
                      # check to see if this is a continution from the previous page
                      if first_para_continued :
@@ -715,16 +721,16 @@ class DocParser(object):
                      (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                      if pclass and (len(pclass) >= 6) and (ptype == 'full'):
                          tag = 'p'
-                        if pclass[3:6] == 'h1-' : tag = 'h4'
-                        if pclass[3:6] == 'h2-' : tag = 'h5'
-                        if pclass[3:6] == 'h3-' : tag = 'h6'
-                        hlst.append('<' + tag + ' class="' + pclass + '">')
+                        if pclass[3:6] == b'h1-' : tag = 'h4'
+                        if pclass[3:6] == b'h2-' : tag = 'h5'
+                        if pclass[3:6] == b'h3-' : tag = 'h6'
+                        hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                          hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                          hlst.append('</' + tag + '>')
                      else :
                          hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
  
-                elif (regtype == 'tocentry') :
+                elif (regtype == b'tocentry') :
                      ptype = 'full'
                      if first_para_continued :
                          ptype = 'end'
@@ -733,7 +739,7 @@ class DocParser(object):
                      tocinfo += self.buildTOCEntry(pdesc)
                      hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
  
-                elif (regtype == 'vertical') or (regtype == 'table') :
+                elif (regtype == b'vertical') or (regtype == b'table') :
                      ptype = 'full'
                      if inGroup:
                          ptype = 'middle'
@@ -744,19 +750,19 @@ class DocParser(object):
                      hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
  
  
-                elif (regtype == 'synth_fcvr.center'):
-                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
+                elif (regtype == b'synth_fcvr.center'):
+                    (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                      if simgsrc:
                          hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
  
                  else :
                      print('          Making region type', regtype, end=' ')
-                    (pos, temp) = self.findinDoc('paragraph',start,end)
-                    (pos2, temp) = self.findinDoc('span',start,end)
+                    (pos, temp) = self.findinDoc(b'paragraph',start,end)
+                    (pos2, temp) = self.findinDoc(b'span',start,end)
                      if pos != -1 or pos2 != -1:
                          print(' a "text" region')
                          orig_regtype = regtype
-                        regtype = 'fixed'
+                        regtype = b'fixed'
                          ptype = 'full'
                          # check to see if this is a continution from the previous page
                          if first_para_continued :
@@ -764,23 +770,23 @@ class DocParser(object):
                              first_para_continued = False
                          (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                          if not pclass:
-                            if orig_regtype.endswith('.right')     : pclass = 'cl-right'
-                            elif orig_regtype.endswith('.center')  : pclass = 'cl-center'
-                            elif orig_regtype.endswith('.left')    : pclass = 'cl-left'
-                            elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
+                            if orig_regtype.endswith(b'.right')     : pclass = 'cl-right'
+                            elif orig_regtype.endswith(b'.center')  : pclass = 'cl-center'
+                            elif orig_regtype.endswith(b'.left')    : pclass = 'cl-left'
+                            elif orig_regtype.endswith(b'.justify') : pclass = 'cl-justify'
                          if pclass and (ptype == 'full') and (len(pclass) >= 6):
                              tag = 'p'
-                            if pclass[3:6] == 'h1-' : tag = 'h4'
-                            if pclass[3:6] == 'h2-' : tag = 'h5'
-                            if pclass[3:6] == 'h3-' : tag = 'h6'
-                            hlst.append('<' + tag + ' class="' + pclass + '">')
+                            if pclass[3:6] == b'h1-' : tag = 'h4'
+                            if pclass[3:6] == b'h2-' : tag = 'h5'
+                            if pclass[3:6] == b'h3-' : tag = 'h6'
+                            hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                              hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                              hlst.append('</' + tag + '>')
                          else :
                              hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
                      else :
                          print(' a "graphic" region')
-                        (pos, simgsrc) = self.findinDoc('img.src',start,end)
+                        (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                          if simgsrc:
                              hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))
  
diff --git a/DeDRM_plugin/flatxml2svg.py b/DeDRM_plugin/flatxml2svg.py

index 72c7e3c33fa294b72471a767f397e019d9c4d709..3768358e4d71bca9371ac03ff4ba03348ea59b2a 100644 (file)
--- a/DeDRM_plugin/flatxml2svg.py
+++ b/DeDRM_plugin/flatxml2svg.py
@@ -12,7 +12,7 @@ from struct import unpack
  class PParser(object):
      def __init__(self, gd, flatxml, meta_array):
          self.gd = gd
-        self.flatdoc = flatxml.split('\n')
+        self.flatdoc = flatxml.split(b'\n')
          self.docSize = len(self.flatdoc)
          self.temp = []
  
@@ -58,11 +58,11 @@ class PParser(object):
      def lineinDoc(self, pos) :
          if (pos >= 0) and (pos < self.docSize) :
              item = self.flatdoc[pos]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
              else :
                  name = item
-                argres = ''
+                argres = b''
          return name, argres
  
      # find tag in doc if within pos to end inclusive
@@ -75,11 +75,13 @@ class PParser(object):
          foundat = -1
          for j in range(pos, end):
              item = self.flatdoc[j]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
              else :
                  name = item
-                argres = ''
+                argres = b''
+            if (isinstance(tagpath,str)):
+                tagpath = tagpath.encode('utf-8')
              if name.endswith(tagpath) :
                  result = argres
                  foundat = j
@@ -103,9 +105,9 @@ class PParser(object):
          cnt = len(self.flatdoc)
          for j in range(cnt):
              item = self.flatdoc[j]
-            if item.find('=') >= 0:
-                (name, argt) = item.split('=')
-                argres = argt.split('|')
+            if item.find(b'=') >= 0:
+                (name, argt) = item.split(b'=')
+                argres = argt.split(b'|')
              else:
                  name = item
                  argres = []
@@ -120,15 +122,17 @@ class PParser(object):
      def getDataatPos(self, path, pos):
          result = None
          item = self.flatdoc[pos]
-        if item.find('=') >= 0:
-            (name, argt) = item.split('=')
-            argres = argt.split('|')
+        if item.find(b'=') >= 0:
+            (name, argt) = item.split(b'=')
+            argres = argt.split(b'|')
          else:
              name = item
              argres = []
          if (len(argres) > 0) :
              for j in range(0,len(argres)):
                  argres[j] = int(argres[j])
+        if (isinstance(path,str)):
+            path = path.encode('utf-8')
          if (name.endswith(path)):
              result = argres
          return result
@@ -138,12 +142,14 @@ class PParser(object):
          cnt = len(self.temp)
          for j in range(cnt):
              item = self.temp[j]
-            if item.find('=') >= 0:
-                (name, argt) = item.split('=')
-                argres = argt.split('|')
+            if item.find(b'=') >= 0:
+                (name, argt) = item.split(b'=')
+                argres = argt.split(b'|')
              else:
                  name = item
                  argres = []
+            if (isinstance(path,str)):
+                path = path.encode('utf-8')
              if (name.endswith(path)):
                  result = argres
                  self.temp.pop(j)
diff --git a/DeDRM_plugin/genbook.py b/DeDRM_plugin/genbook.py

index ea1ca387ddfb4232df84ad3dee71f9c664c1caff..915bd30837fc963b9ce7bdedcc2b1c5b1d901fca 100644 (file)
--- a/DeDRM_plugin/genbook.py
+++ b/DeDRM_plugin/genbook.py
@@ -44,10 +44,10 @@ if inCalibre :
      from calibre_plugins.dedrm import flatxml2svg
      from calibre_plugins.dedrm import stylexml2css
  else :
-    from . import convert2xml
-    from . import flatxml2html
-    from . import flatxml2svg
-    from . import stylexml2css
+    import convert2xml
+    import flatxml2html
+    import flatxml2svg
+    import stylexml2css
  
  # global switch
  buildXML = False
@@ -117,10 +117,10 @@ class Dictionary(object):
              self.stable.append(self.escapestr(readString(self.fo)))
          self.pos = 0
      def escapestr(self, str):
-        str = str.replace('&','&amp;')
-        str = str.replace('<','&lt;')
-        str = str.replace('>','&gt;')
-        str = str.replace('=','&#61;')
+        str = str.replace(b'&',b'&amp;')
+        str = str.replace(b'<',b'&lt;')
+        str = str.replace(b'>',b'&gt;')
+        str = str.replace(b'=',b'&#61;')
          return str
      def lookup(self,val):
          if ((val >= 0) and (val < self.size)) :
@@ -138,7 +138,7 @@ class Dictionary(object):
  
  class PageDimParser(object):
      def __init__(self, flatxml):
-        self.flatdoc = flatxml.split('\n')
+        self.flatdoc = flatxml.split(b'\n')
      # find tag if within pos to end inclusive
      def findinDoc(self, tagpath, pos, end) :
          result = None
@@ -151,8 +151,8 @@ class PageDimParser(object):
          foundat = -1
          for j in range(pos, end):
              item = docList[j]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=')
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=')
              else :
                  name = item
                  argres = ''
@@ -162,8 +162,8 @@ class PageDimParser(object):
                  break
          return foundat, result
      def process(self):
-        (pos, sph) = self.findinDoc('page.h',0,-1)
-        (pos, spw) = self.findinDoc('page.w',0,-1)
+        (pos, sph) = self.findinDoc(b'page.h',0,-1)
+        (pos, spw) = self.findinDoc(b'page.w',0,-1)
          if (sph == None): sph = '-1'
          if (spw == None): spw = '-1'
          return sph, spw
@@ -176,21 +176,21 @@ def getPageDim(flatxml):
  
  class GParser(object):
      def __init__(self, flatxml):
-        self.flatdoc = flatxml.split('\n')
+        self.flatdoc = flatxml.split(b'\n')
          self.dpi = 1440
-        self.gh = self.getData('info.glyph.h')
-        self.gw = self.getData('info.glyph.w')
-        self.guse = self.getData('info.glyph.use')
+        self.gh = self.getData(b'info.glyph.h')
+        self.gw = self.getData(b'info.glyph.w')
+        self.guse = self.getData(b'info.glyph.use')
          if self.guse :
              self.count = len(self.guse)
          else :
              self.count = 0
-        self.gvtx = self.getData('info.glyph.vtx')
-        self.glen = self.getData('info.glyph.len')
-        self.gdpi = self.getData('info.glyph.dpi')
-        self.vx = self.getData('info.vtx.x')
-        self.vy = self.getData('info.vtx.y')
-        self.vlen = self.getData('info.len.n')
+        self.gvtx = self.getData(b'info.glyph.vtx')
+        self.glen = self.getData(b'info.glyph.len')
+        self.gdpi = self.getData(b'info.glyph.dpi')
+        self.vx = self.getData(b'info.vtx.x')
+        self.vy = self.getData(b'info.vtx.y')
+        self.vlen = self.getData(b'info.len.n')
          if self.vlen :
              self.glen.append(len(self.vlen))
          elif self.glen:
@@ -204,9 +204,9 @@ class GParser(object):
          cnt = len(self.flatdoc)
          for j in range(cnt):
              item = self.flatdoc[j]
-            if item.find('=') >= 0:
-                (name, argt) = item.split('=')
-                argres = argt.split('|')
+            if item.find(b'=') >= 0:
+                (name, argt) = item.split(b'=')
+                argres = argt.split(b'|')
              else:
                  name = item
                  argres = []
@@ -431,7 +431,7 @@ def generateBook(bookDir, raw, fixedimage):
  
      # now get the css info
      cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
-    open(xname, 'wb').write(cssstr)
+    open(xname, 'w').write(cssstr)
      if buildXML:
          xname = os.path.join(xmlDir, 'other0000.xml')
          open(xname, 'wb').write(convert2xml.getXML(dict, otherFile))
@@ -525,7 +525,7 @@ def generateBook(bookDir, raw, fixedimage):
      hlst.append('</body>\n</html>\n')
      htmlstr = "".join(hlst)
      hlst = None
-    open(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
+    open(os.path.join(bookDir, htmlFileName), 'w').write(htmlstr)
  
      print(" ")
      print('Extracting Table of Contents from Amazon OCR')
@@ -571,7 +571,7 @@ def generateBook(bookDir, raw, fixedimage):
      tlst.append('</body>\n')
      tlst.append('</html>\n')
      tochtml = "".join(tlst)
-    open(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
+    open(os.path.join(svgDir, 'toc.xhtml'), 'w').write(tochtml)
  
  
      # now create index_svg.xhtml that points to all required files
@@ -608,7 +608,7 @@ def generateBook(bookDir, raw, fixedimage):
          flst = []
          for page in pagelst:
              flst.append(xmllst[page])
-        flat_svg = "".join(flst)
+        flat_svg = b"".join(flst)
          flst=None
          svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
          if (raw) :
@@ -626,7 +626,7 @@ def generateBook(bookDir, raw, fixedimage):
      slst.append('</body>\n</html>\n')
      svgindex = "".join(slst)
      slst = None
-    open(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
+    open(os.path.join(bookDir, 'index_svg.xhtml'), 'w').write(svgindex)
  
      print(" ")
  
@@ -637,16 +637,16 @@ def generateBook(bookDir, raw, fixedimage):
      olst.append('<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="guid_id">\n')
      # adding metadata
      olst.append('   <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n')
-    if 'GUID' in meta_array:
-        olst.append('      <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array['GUID'] + '</dc:identifier>\n')
-    if 'ASIN' in meta_array:
-        olst.append('      <dc:identifier opf:scheme="ASIN">' + meta_array['ASIN'] + '</dc:identifier>\n')
-    if 'oASIN' in meta_array:
-        olst.append('      <dc:identifier opf:scheme="oASIN">' + meta_array['oASIN'] + '</dc:identifier>\n')
-    olst.append('      <dc:title>' + meta_array['Title'] + '</dc:title>\n')
-    olst.append('      <dc:creator opf:role="aut">' + meta_array['Authors'] + '</dc:creator>\n')
+    if b'GUID' in meta_array:
+        olst.append('      <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array[b'GUID'].decode('utf-8') + '</dc:identifier>\n')
+    if b'ASIN' in meta_array:
+        olst.append('      <dc:identifier opf:scheme="ASIN">' + meta_array[b'ASIN'].decode('utf-8') + '</dc:identifier>\n')
+    if b'oASIN' in meta_array:
+        olst.append('      <dc:identifier opf:scheme="oASIN">' + meta_array[b'oASIN'].decode('utf-8') + '</dc:identifier>\n')
+    olst.append('      <dc:title>' + meta_array[b'Title'].decode('utf-8') + '</dc:title>\n')
+    olst.append('      <dc:creator opf:role="aut">' + meta_array[b'Authors'].decode('utf-8') + '</dc:creator>\n')
      olst.append('      <dc:language>en</dc:language>\n')
-    olst.append('      <dc:date>' + meta_array['UpdateTime'] + '</dc:date>\n')
+    olst.append('      <dc:date>' + meta_array[b'UpdateTime'].decode('utf-8') + '</dc:date>\n')
      if isCover:
          olst.append('      <meta name="cover" content="bookcover"/>\n')
      olst.append('   </metadata>\n')
@@ -675,7 +675,7 @@ def generateBook(bookDir, raw, fixedimage):
      olst.append('</package>\n')
      opfstr = "".join(olst)
      olst = None
-    open(opfname, 'wb').write(opfstr)
+    open(opfname, 'w').write(opfstr)
  
      print('Processing Complete')
  
diff --git a/DeDRM_plugin/kgenpids.py b/DeDRM_plugin/kgenpids.py

index 466cf5c002e3eee67e566a0c3fceaa01615966f6..86ffab7f4da947179f019cae78c14a0f38894c64 100644 (file)
--- a/DeDRM_plugin/kgenpids.py
+++ b/DeDRM_plugin/kgenpids.py
@@ -49,14 +49,15 @@ def SHA1(message):
  
  
  # Encode the bytes in data with the characters in map
+# data and map should be byte arrays
  def encode(data, map):
-    result = ''
+    result = b''
      for char in data:
-        value = ord(char)
+        value = char
          Q = (value ^ 0x80) // len(map)
          R = value % len(map)
-        result += map[Q]
-        result += map[R]
+        result += bytes([map[Q]])
+        result += bytes([map[R]])
      return result
  
  # Hash the bytes in data and then encode the digest with the characters in map
@@ -117,7 +118,7 @@ def generatePidEncryptionTable() :
  def generatePidSeed(table,dsn) :
      value = 0
      for counter in range (0,4) :
-        index = (ord(dsn[counter]) ^ value) &0xFF
+        index = (dsn[counter] ^ value) & 0xFF
          value = (value >> 8) ^ table[index]
      return value
  
@@ -129,7 +130,7 @@ def generateDevicePID(table,dsn,nbRoll):
      pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
      index = 0
      for counter in range (0,nbRoll):
-        pid[index] = pid[index] ^ ord(dsn[counter])
+        pid[index] = pid[index] ^ dsn[counter]
          index = (index+1) %8
      for counter in range (0,8):
          index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
@@ -205,7 +206,7 @@ def getK4Pids(rec209, token, kindleDatabase):
  
      try:
          # Get the kindle account token, if present
-        kindleAccountToken = bytearray.fromhex((kindleDatabase[1])[b'kindle.account.tokens']).decode()
+        kindleAccountToken = bytearray.fromhex((kindleDatabase[1])['kindle.account.tokens'])
  
      except KeyError:
          kindleAccountToken=""
@@ -213,30 +214,30 @@ def getK4Pids(rec209, token, kindleDatabase):
  
      try:
          # Get the DSN token, if present
-        DSN = bytearray.fromhex((kindleDatabase[1])['DSN']).decode()
+        DSN = bytearray.fromhex((kindleDatabase[1])['DSN'])
          print("Got DSN key from database {0}".format(kindleDatabase[0]))
      except KeyError:
          # See if we have the info to generate the DSN
          try:
              # Get the Mazama Random number
-            MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])[b'MazamaRandomNumber']).decode()
+            MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])['MazamaRandomNumber'])
              #print "Got MazamaRandomNumber from database {0}".format(kindleDatabase[0])
  
              try:
                  # Get the SerialNumber token, if present
-                IDString = bytearray.fromhex((kindleDatabase[1])[b'SerialNumber']).decode()
+                IDString = bytearray.fromhex((kindleDatabase[1])['SerialNumber'])
                  print("Got SerialNumber from database {0}".format(kindleDatabase[0]))
              except KeyError:
                   # Get the IDString we added
-                IDString = bytearray.fromhex((kindleDatabase[1])[b'IDString']).decode()
+                IDString = bytearray.fromhex((kindleDatabase[1])['IDString'])
  
              try:
                  # Get the UsernameHash token, if present
-                encodedUsername = bytearray.fromhex((kindleDatabase[1])[b'UsernameHash']).decode()
+                encodedUsername = bytearray.fromhex((kindleDatabase[1])['UsernameHash'])
                  print("Got UsernameHash from database {0}".format(kindleDatabase[0]))
              except KeyError:
                  # Get the UserName we added
-                UserName = bytearray.fromhex((kindleDatabase[1])[b'UserName']).decode()
+                UserName = bytearray.fromhex((kindleDatabase[1])['UserName'])
                  # encode it
                  encodedUsername = encodeHash(UserName,charMap1)
                  #print "encodedUsername",encodedUsername.encode('hex')
@@ -266,19 +267,19 @@ def getK4Pids(rec209, token, kindleDatabase):
      # Compute book PIDs
  
      # book pid
-    pidHash = SHA1(DSN.encode()+kindleAccountToken.encode()+rec209+token)
+    pidHash = SHA1(DSN+kindleAccountToken+rec209+token)
      bookPID = encodePID(pidHash)
      bookPID = checksumPid(bookPID)
      pids.append(bookPID)
  
      # variant 1
-    pidHash = SHA1(kindleAccountToken.encode()+rec209+token)
+    pidHash = SHA1(kindleAccountToken+rec209+token)
      bookPID = encodePID(pidHash)
      bookPID = checksumPid(bookPID)
      pids.append(bookPID)
  
      # variant 2
-    pidHash = SHA1(DSN.encode()+rec209+token)
+    pidHash = SHA1(DSN+rec209+token)
      bookPID = encodePID(pidHash)
      bookPID = checksumPid(bookPID)
      pids.append(bookPID)
diff --git a/DeDRM_plugin/mobidedrm.py b/DeDRM_plugin/mobidedrm.py

index e9b0fc148196a64dfc4a4fb662731956094b776c..ce21fbd03c78b770e392c4b8f5123b7997845c20 100644 (file)
--- a/DeDRM_plugin/mobidedrm.py
+++ b/DeDRM_plugin/mobidedrm.py
@@ -7,7 +7,7 @@
  
  from __future__ import print_function
  __license__ = 'GPL v3'
-__version__ = "1.00"
+__version__ = "1.0"
  
  # This is a python script. You need a Python interpreter to run it.
  # For example, ActiveState Python, which exists for windows.
@@ -73,7 +73,7 @@ __version__ = "1.00"
  #  0.40 - moved unicode_argv call inside main for Windows DeDRM compatibility
  #  0.41 - Fixed potential unicode problem in command line calls
  #  0.42 - Added GPL v3 licence. updated/removed some print statements
-#  1.00 - Python 3 compatibility for calibre 5.0
+#  1.0  - Python 3 compatibility for calibre 5.0
  
  import sys
  import os
@@ -330,7 +330,7 @@ class MobiBook:
          }
          title = ''
          codec = 'windows-1252'
-        if self.magic == 'BOOKMOBI':
+        if self.magic == b'BOOKMOBI':
              if 503 in self.meta_array:
                  title = self.meta_array[503]
              else:
diff --git a/DeDRM_plugin/stylexml2css.py b/DeDRM_plugin/stylexml2css.py

index 3e360a45713f91cc244bb31f9eac779eca86e152..1d46a9e5f037376ce39c5fca00aeb860a417c0ce 100644 (file)
--- a/DeDRM_plugin/stylexml2css.py
+++ b/DeDRM_plugin/stylexml2css.py
@@ -15,36 +15,36 @@ debug = False
  
  class DocParser(object):
      def __init__(self, flatxml, fontsize, ph, pw):
-        self.flatdoc = flatxml.split('\n')
+        self.flatdoc = flatxml.split(b'\n')
          self.fontsize = int(fontsize)
          self.ph = int(ph) * 1.0
          self.pw = int(pw) * 1.0
  
      stags = {
-        'paragraph' : 'p',
-        'graphic'   : '.graphic'
+        b'paragraph' : 'p',
+        b'graphic'   : '.graphic'
      }
  
      attr_val_map = {
-        'hang'            : 'text-indent: ',
-        'indent'          : 'text-indent: ',
-        'line-space'      : 'line-height: ',
-        'margin-bottom'   : 'margin-bottom: ',
-        'margin-left'     : 'margin-left: ',
-        'margin-right'    : 'margin-right: ',
-        'margin-top'      : 'margin-top: ',
-        'space-after'     : 'padding-bottom: ',
+        b'hang'            : 'text-indent: ',
+        b'indent'          : 'text-indent: ',
+        b'line-space'      : 'line-height: ',
+        b'margin-bottom'   : 'margin-bottom: ',
+        b'margin-left'     : 'margin-left: ',
+        b'margin-right'    : 'margin-right: ',
+        b'margin-top'      : 'margin-top: ',
+        b'space-after'     : 'padding-bottom: ',
      }
  
      attr_str_map = {
-        'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
-        'align-left'   : 'text-align: left;',
-        'align-right'  : 'text-align: right;',
-        'align-justify' : 'text-align: justify;',
-        'display-inline' : 'display: inline;',
-        'pos-left' : 'text-align: left;',
-        'pos-right' : 'text-align: right;',
-        'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
+        b'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
+        b'align-left'   : 'text-align: left;',
+        b'align-right'  : 'text-align: right;',
+        b'align-justify' : 'text-align: justify;',
+        b'display-inline' : 'display: inline;',
+        b'pos-left' : 'text-align: left;',
+        b'pos-right' : 'text-align: right;',
+        b'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
      }
  
  
@@ -60,11 +60,13 @@ class DocParser(object):
          foundat = -1
          for j in range(pos, end):
              item = docList[j]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
              else :
                  name = item
-                argres = ''
+                argres = b''
+            if (isinstance(tagpath,str)):
+                tagpath = tagpath.encode('utf-8')
              if name.endswith(tagpath) :
                  result = argres
                  foundat = j
@@ -76,7 +78,7 @@ class DocParser(object):
      def posinDoc(self, tagpath):
          startpos = []
          pos = 0
-        res = ""
+        res = b""
          while res != None :
              (foundpos, res) = self.findinDoc(tagpath, pos, -1)
              if res != None :
@@ -87,11 +89,11 @@ class DocParser(object):
      # returns a vector of integers for the tagpath
      def getData(self, tagpath, pos, end, clean=False):
          if clean:
-            digits_only = re.compile(r'''([0-9]+)''')
+            digits_only = re.compile(rb'''([0-9]+)''')
          argres=[]
          (foundat, argt) = self.findinDoc(tagpath, pos, end)
          if (argt != None) and (len(argt) > 0) :
-            argList = argt.split('|')
+            argList = argt.split(b'|')
              for strval in argList:
                  if clean:
                      m = re.search(digits_only, strval)
@@ -109,7 +111,7 @@ class DocParser(object):
          csspage += '.cl-justify { text-align: justify; }\n'
  
          # generate a list of each <style> starting point in the stylesheet
-        styleList= self.posinDoc('book.stylesheet.style')
+        styleList= self.posinDoc(b'book.stylesheet.style')
          stylecnt = len(styleList)
          styleList.append(-1)
  
@@ -121,30 +123,30 @@ class DocParser(object):
              start = styleList[j]
              end = styleList[j+1]
  
-            (pos, tag) = self.findinDoc('style._tag',start,end)
+            (pos, tag) = self.findinDoc(b'style._tag',start,end)
              if tag == None :
-                (pos, tag) = self.findinDoc('style.type',start,end)
+                (pos, tag) = self.findinDoc(b'style.type',start,end)
  
              # Is this something we know how to convert to css
              if tag in self.stags :
  
                  # get the style class
-                (pos, sclass) = self.findinDoc('style.class',start,end)
+                (pos, sclass) = self.findinDoc(b'style.class',start,end)
                  if sclass != None:
-                    sclass = sclass.replace(' ','-')
-                    sclass = '.cl-' + sclass.lower()
+                    sclass = sclass.replace(b' ',b'-')
+                    sclass = b'.cl-' + sclass.lower()
                  else :
-                    sclass = ''
+                    sclass = b''
  
                  if debug: print('sclass', sclass)
  
                  # check for any "after class" specifiers
-                (pos, aftclass) = self.findinDoc('style._after_class',start,end)
+                (pos, aftclass) = self.findinDoc(b'style._after_class',start,end)
                  if aftclass != None:
-                    aftclass = aftclass.replace(' ','-')
-                    aftclass = '.cl-' + aftclass.lower()
+                    aftclass = aftclass.replace(b' ',b'-')
+                    aftclass = b'.cl-' + aftclass.lower()
                  else :
-                    aftclass = ''
+                    aftclass = b''
  
                  if debug: print('aftclass', aftclass)
  
@@ -152,34 +154,37 @@ class DocParser(object):
  
                  while True :
  
-                    (pos1, attr) = self.findinDoc('style.rule.attr', start, end)
-                    (pos2, val) = self.findinDoc('style.rule.value', start, end)
+                    (pos1, attr) = self.findinDoc(b'style.rule.attr', start, end)
+                    (pos2, val) = self.findinDoc(b'style.rule.value', start, end)
  
                      if debug: print('attr', attr)
                      if debug: print('val', val)
  
                      if attr == None : break
  
-                    if (attr == 'display') or (attr == 'pos') or (attr == 'align'):
+                    if (attr == b'display') or (attr == b'pos') or (attr == b'align'):
                          # handle text based attributess
-                        attr = attr + '-' + val
+                        attr = attr + b'-' + val
                          if attr in self.attr_str_map :
-                            cssargs[attr] = (self.attr_str_map[attr], '')
+                            cssargs[attr] = (self.attr_str_map[attr], b'')
                      else :
                          # handle value based attributes
                          if attr in self.attr_val_map :
                              name = self.attr_val_map[attr]
-                            if attr in ('margin-bottom', 'margin-top', 'space-after') :
+                            if attr in (b'margin-bottom', b'margin-top', b'space-after') :
                                  scale = self.ph
-                            elif attr in ('margin-right', 'indent', 'margin-left', 'hang') :
+                            elif attr in (b'margin-right', b'indent', b'margin-left', b'hang') :
                                  scale = self.pw
-                            elif attr == 'line-space':
+                            elif attr == b'line-space':
                                  scale = self.fontsize * 2.0
+                            else:
+                                print("Scale not defined!")
+                                scale = 1.0
  
                              if val == "":
                                  val = 0
  
-                            if not ((attr == 'hang') and (int(val) == 0)):
+                            if not ((attr == b'hang') and (int(val) == 0)):
                                  try:
                                      f = float(val)
                                  except:
@@ -198,32 +203,32 @@ class DocParser(object):
                      if debug: print('keeping style')
                      # make sure line-space does not go below 100% or above 300% since
                      # it can be wacky in some styles
-                    if 'line-space' in cssargs:
-                        seg = cssargs['line-space'][0]
-                        val = cssargs['line-space'][1]
+                    if b'line-space' in cssargs:
+                        seg = cssargs[b'line-space'][0]
+                        val = cssargs[b'line-space'][1]
                          if val < 1.0: val = 1.0
                          if val > 3.0: val = 3.0
-                        del cssargs['line-space']
-                        cssargs['line-space'] = (self.attr_val_map['line-space'], val)
+                        del cssargs[b'line-space']
+                        cssargs[b'line-space'] = (self.attr_val_map[b'line-space'], val)
  
  
                      # handle modifications for css style hanging indents
-                    if 'hang' in cssargs:
-                        hseg = cssargs['hang'][0]
-                        hval = cssargs['hang'][1]
-                        del cssargs['hang']
-                        cssargs['hang'] = (self.attr_val_map['hang'], -hval)
+                    if b'hang' in cssargs:
+                        hseg = cssargs[b'hang'][0]
+                        hval = cssargs[b'hang'][1]
+                        del cssargs[b'hang']
+                        cssargs[b'hang'] = (self.attr_val_map[b'hang'], -hval)
                          mval = 0
                          mseg = 'margin-left: '
                          mval = hval
-                        if 'margin-left' in cssargs:
-                            mseg = cssargs['margin-left'][0]
-                            mval = cssargs['margin-left'][1]
+                        if b'margin-left' in cssargs:
+                            mseg = cssargs[b'margin-left'][0]
+                            mval = cssargs[b'margin-left'][1]
                              if mval < 0: mval = 0
                              mval = hval + mval
-                        cssargs['margin-left'] = (mseg, mval)
-                        if 'indent' in cssargs:
-                            del cssargs['indent']
+                        cssargs[b'margin-left'] = (mseg, mval)
+                        if b'indent' in cssargs:
+                            del cssargs[b'indent']
  
                      cssline = sclass + ' { '
                      for key in iter(cssargs):
diff --git a/DeDRM_plugin/topazextract.py b/DeDRM_plugin/topazextract.py

index 1eb6b23ab5921d606e9cffa04f86dbb686a0518c..5125d6238ae4f412987bf2fb0cfd9ed301d5b934 100644 (file)
--- a/DeDRM_plugin/topazextract.py
+++ b/DeDRM_plugin/topazextract.py
@@ -173,7 +173,7 @@ def decryptRecord(data,PID):
  def decryptDkeyRecord(data,PID):
      record = decryptRecord(data,PID)
      fields = unpack('3sB8sB8s3s',record)
-    if fields[0] != 'PID' or fields[5] != 'pid' :
+    if fields[0] != b'PID' or fields[5] != b'pid' :
          raise DrmException("Didn't find PID magic numbers in record")
      elif fields[1] != 8 or fields[3] != 8 :
          raise DrmException("Record didn't contain correct length fields")
@@ -183,11 +183,11 @@ def decryptDkeyRecord(data,PID):
  
  # Decrypt all dkey records (contain the book PID)
  def decryptDkeyRecords(data,PID):
-    nbKeyRecords = ord(data[0])
+    nbKeyRecords = data[0]
      records = []
      data = data[1:]
      for i in range (0,nbKeyRecords):
-        length = ord(data[0])
+        length = data[0]
          try:
              key = decryptDkeyRecord(data[1:length+1],PID)
              records.append(key)
@@ -209,7 +209,7 @@ class TopazBook:
          self.bookMetadata = {}
          self.bookKey = None
          magic = unpack('4s',self.fo.read(4))[0]
-        if magic != 'TPZ0':
+        if magic != b'TPZ0':
              raise DrmException("Parse Error : Invalid Header, not a Topaz file")
          self.parseTopazHeaders()
          self.parseMetadata()
@@ -244,9 +244,9 @@ class TopazBook:
  
      def parseMetadata(self):
          # Parse the metadata record from the book payload and return a list of [key,values]
-        self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords['metadata'][0][0])
+        self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords[b'metadata'][0][0])
          tag = bookReadString(self.fo)
-        if tag != 'metadata' :
+        if tag != b'metadata' :
              raise DrmException("Parse Error : Record Names Don't Match")
          flags = ord(self.fo.read(1))
          nbRecords = ord(self.fo.read(1))
@@ -260,18 +260,18 @@ class TopazBook:
          return self.bookMetadata
  
      def getPIDMetaInfo(self):
-        keysRecord = self.bookMetadata.get('keys','')
-        keysRecordRecord = ''
-        if keysRecord != '':
-            keylst = keysRecord.split(',')
+        keysRecord = self.bookMetadata.get(b'keys',b'')
+        keysRecordRecord = b''
+        if keysRecord != b'':
+            keylst = keysRecord.split(b',')
              for keyval in keylst:
-                keysRecordRecord += self.bookMetadata.get(keyval,'')
+                keysRecordRecord += self.bookMetadata.get(keyval,b'')
          return keysRecord, keysRecordRecord
  
      def getBookTitle(self):
-        title = ''
-        if 'Title' in self.bookMetadata:
-            title = self.bookMetadata['Title']
+        title = b''
+        if b'Title' in self.bookMetadata:
+            title = self.bookMetadata[b'Title']
          return title.decode('utf-8')
  
      def setBookKey(self, key):
@@ -323,7 +323,7 @@ class TopazBook:
          raw = 0
          fixedimage=True
          try:
-            keydata = self.getBookPayloadRecord('dkey', 0)
+            keydata = self.getBookPayloadRecord(b'dkey', 0)
          except DrmException as e:
              print("no dkey record found, book may not be encrypted")
              print("attempting to extrct files without a book key")
@@ -354,7 +354,7 @@ class TopazBook:
                  pass
              else:
                  bookKey = bookKeys[0]
-                print("Book Key Found! ({0})".format(bookKey.encode('hex')))
+                print("Book Key Found! ({0})".format(bookKey.hex()))
                  break
  
          if not bookKey:
@@ -396,26 +396,26 @@ class TopazBook:
          outdir = self.outdir
          for headerRecord in self.bookHeaderRecords:
              name = headerRecord
-            if name != 'dkey':
+            if name != b'dkey':
                  ext = ".dat"
-                if name == 'img': ext = ".jpg"
-                if name == 'color' : ext = ".jpg"
-                print("Processing Section: {0}\n. . .".format(name), end=' ')
+                if name == b'img': ext = ".jpg"
+                if name == b'color' : ext = ".jpg"
+                print("Processing Section: {0}\n. . .".format(name.decode('utf-8')), end=' ')
                  for index in range (0,len(self.bookHeaderRecords[name])) :
-                    fname = "{0}{1:04d}{2}".format(name,index,ext)
+                    fname = "{0}{1:04d}{2}".format(name.decode('utf-8'),index,ext)
                      destdir = outdir
-                    if name == 'img':
+                    if name == b'img':
                          destdir =  os.path.join(outdir,"img")
-                    if name == 'color':
+                    if name == b'color':
                          destdir =  os.path.join(outdir,"color_img")
-                    if name == 'page':
+                    if name == b'page':
                          destdir =  os.path.join(outdir,"page")
-                    if name == 'glyphs':
+                    if name == b'glyphs':
                          destdir =  os.path.join(outdir,"glyphs")
                      outputFile = os.path.join(destdir,fname)
                      print(".", end=' ')
                      record = self.getBookPayloadRecord(name,index)
-                    if record != '':
+                    if record != b'':
                          open(outputFile, 'wb').write(record)
                  print(" ")
author	Apprentice Harper <apprenticeharper@gmail.com>
	Fri, 16 Oct 2020 12:58:59 +0000 (13:58 +0100)
committer	Apprentice Harper <apprenticeharper@gmail.com>
	Fri, 16 Oct 2020 12:58:59 +0000 (13:58 +0100)
DeDRM_plugin/convert2xml.py		patch \| blob \| blame \| history
DeDRM_plugin/flatxml2html.py		patch \| blob \| blame \| history
DeDRM_plugin/flatxml2svg.py		patch \| blob \| blame \| history
DeDRM_plugin/genbook.py		patch \| blob \| blame \| history
DeDRM_plugin/kgenpids.py		patch \| blob \| blame \| history
DeDRM_plugin/mobidedrm.py		patch \| blob \| blame \| history
DeDRM_plugin/stylexml2css.py		patch \| blob \| blame \| history
DeDRM_plugin/topazextract.py		patch \| blob \| blame \| history