topaz tools 1.0 (I think)

author some_updates <some_updates@gmail.com>

Sun, 17 Jan 2010 12:10:35 +0000 (12:10 +0000)

committer Apprentice Alf <apprenticealf@gmail.com>

Sat, 28 Feb 2015 12:11:14 +0000 (12:11 +0000)
author some_updates <some_updates@gmail.com>
Sun, 17 Jan 2010 12:10:35 +0000 (12:10 +0000)
committer Apprentice Alf <apprenticealf@gmail.com>
Sat, 28 Feb 2015 12:11:14 +0000 (12:11 +0000)
diff --git a/Topaz_Tools/lib/cmbtc_dump.py b/Topaz_Tools/lib/cmbtc_dump.py

new file mode 100644 (file)

index 0000000..9cd32de
--- /dev/null
+++ b/Topaz_Tools/lib/cmbtc_dump.py
@@ -0,0 +1,865 @@
+#! /usr/bin/python
+
+"""
+
+Comprehensive Mazama Book DRM with Topaz Cryptography V2.0
+
+-----BEGIN PUBLIC KEY-----
+MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDdBHJ4CNc6DNFCw4MRCw4SWAK6
+M8hYfnNEI0yQmn5Ti+W8biT7EatpauE/5jgQMPBmdNrDr1hbHyHBSP7xeC2qlRWC
+B62UCxeu/fpfnvNHDN/wPWWH4jynZ2M6cdcnE5LQ+FfeKqZn7gnG2No1U9h7oOHx
+y2/pHuYme7U1TsgSjwIDAQAB
+-----END PUBLIC KEY-----
+
+"""
+
+from __future__ import with_statement
+
+import csv
+import sys
+import os
+import getopt
+import zlib
+from struct import pack
+from struct import unpack
+from ctypes import windll, c_char_p, c_wchar_p, c_uint, POINTER, byref, \
+    create_unicode_buffer, create_string_buffer, CFUNCTYPE, addressof, \
+    string_at, Structure, c_void_p, cast
+import _winreg as winreg
+import Tkinter
+import Tkconstants
+import tkMessageBox
+import traceback
+import hashlib
+
+MAX_PATH = 255
+
+kernel32 = windll.kernel32
+advapi32 = windll.advapi32
+crypt32 = windll.crypt32
+
+global kindleDatabase
+global bookFile
+global bookPayloadOffset
+global bookHeaderRecords
+global bookMetadata
+global bookKey
+global command
+
+#
+# Various character maps used to decrypt books. Probably supposed to act as obfuscation
+#
+
+charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"
+charMap2 = "AaZzB0bYyCc1XxDdW2wEeVv3FfUuG4g-TtHh5SsIiR6rJjQq7KkPpL8lOoMm9Nn_"
+charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
+
+#
+# Exceptions for all the problems that might happen during the script
+#
+
+class CMBDTCError(Exception):
+    pass
+    
+class CMBDTCFatal(Exception):
+    pass
+    
+#
+# Stolen stuff
+#
+
+class DataBlob(Structure):
+    _fields_ = [('cbData', c_uint),
+                ('pbData', c_void_p)]
+DataBlob_p = POINTER(DataBlob)
+
+def GetSystemDirectory():
+    GetSystemDirectoryW = kernel32.GetSystemDirectoryW
+    GetSystemDirectoryW.argtypes = [c_wchar_p, c_uint]
+    GetSystemDirectoryW.restype = c_uint
+    def GetSystemDirectory():
+        buffer = create_unicode_buffer(MAX_PATH + 1)
+        GetSystemDirectoryW(buffer, len(buffer))
+        return buffer.value
+    return GetSystemDirectory
+GetSystemDirectory = GetSystemDirectory()
+
+
+def GetVolumeSerialNumber():
+    GetVolumeInformationW = kernel32.GetVolumeInformationW
+    GetVolumeInformationW.argtypes = [c_wchar_p, c_wchar_p, c_uint,
+                                      POINTER(c_uint), POINTER(c_uint),
+                                      POINTER(c_uint), c_wchar_p, c_uint]
+    GetVolumeInformationW.restype = c_uint
+    def GetVolumeSerialNumber(path):
+        vsn = c_uint(0)
+        GetVolumeInformationW(path, None, 0, byref(vsn), None, None, None, 0)
+        return vsn.value
+    return GetVolumeSerialNumber
+GetVolumeSerialNumber = GetVolumeSerialNumber()
+
+
+def GetUserName():
+    GetUserNameW = advapi32.GetUserNameW
+    GetUserNameW.argtypes = [c_wchar_p, POINTER(c_uint)]
+    GetUserNameW.restype = c_uint
+    def GetUserName():
+        buffer = create_unicode_buffer(32)
+        size = c_uint(len(buffer))
+        while not GetUserNameW(buffer, byref(size)):
+            buffer = create_unicode_buffer(len(buffer) * 2)
+            size.value = len(buffer)
+        return buffer.value.encode('utf-16-le')[::2]
+    return GetUserName
+GetUserName = GetUserName()
+
+
+def CryptUnprotectData():
+    _CryptUnprotectData = crypt32.CryptUnprotectData
+    _CryptUnprotectData.argtypes = [DataBlob_p, c_wchar_p, DataBlob_p,
+                                   c_void_p, c_void_p, c_uint, DataBlob_p]
+    _CryptUnprotectData.restype = c_uint
+    def CryptUnprotectData(indata, entropy):
+        indatab = create_string_buffer(indata)
+        indata = DataBlob(len(indata), cast(indatab, c_void_p))
+        entropyb = create_string_buffer(entropy)
+        entropy = DataBlob(len(entropy), cast(entropyb, c_void_p))
+        outdata = DataBlob()
+        if not _CryptUnprotectData(byref(indata), None, byref(entropy),
+                                   None, None, 0, byref(outdata)):
+            raise CMBDTCFatal("Failed to Unprotect Data")
+        return string_at(outdata.pbData, outdata.cbData)
+    return CryptUnprotectData
+CryptUnprotectData = CryptUnprotectData()
+
+#
+# Returns the MD5 digest of "message"
+#
+
+def MD5(message):
+    ctx = hashlib.md5()
+    ctx.update(message)
+    return ctx.digest()
+
+#
+# Returns the SHA1 digest of "message"
+#
+
+def SHA1(message):
+    ctx = hashlib.sha1()
+    ctx.update(message)
+    return ctx.digest()
+
+#
+# Open the book file at path
+#
+
+def openBook(path):
+    try:
+        return open(path,'rb')
+    except:
+        raise CMBDTCFatal("Could not open book file: " + path)
+#
+# Encode the bytes in data with the characters in map
+#
+
+def encode(data, map):
+    result = ""
+    for char in data:
+        value = ord(char)
+        Q = (value ^ 0x80) // len(map)
+        R = value % len(map)
+        result += map[Q]
+        result += map[R]
+    return result
+  
+#
+# Hash the bytes in data and then encode the digest with the characters in map
+#
+  
+def encodeHash(data,map):
+    return encode(MD5(data),map)
+
+#
+# Decode the string in data with the characters in map. Returns the decoded bytes
+#
+   
+def decode(data,map):
+    result = ""
+    for i in range (0,len(data),2):
+        high = map.find(data[i])
+        low = map.find(data[i+1])
+        value = (((high * 0x40) ^ 0x80) & 0xFF) + low
+        result += pack("B",value)
+    return result
+  
+#
+# Locate and open the Kindle.info file (Hopefully in the way it is done in the Kindle application)
+#
+  
+def openKindleInfo():
+    regkey = winreg.OpenKey(winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders\\")
+    path = winreg.QueryValueEx(regkey, 'Local AppData')[0] 
+    return open(path+'\\Amazon\\Kindle For PC\\{AMAwzsaPaaZAzmZzZQzgZCAkZ3AjA_AY}\\kindle.info','r')
+
+#
+# Parse the Kindle.info file and return the records as a dict of key -> value
+#
+
+def parseKindleInfo():
+    # Read kindle.info and return a dict mapping the text before the first ':' of
+    # each '{'-separated item to the text after it (both still encoded).
+    DB = {}
+    with openKindleInfo() as infoReader:   # make sure the handle is closed again
+        infoReader.read(1)                 # skip first char (presumably the opening '{')
+        data = infoReader.read()
+    for item in data.split('{'):
+        splito = item.split(':')
+        DB[splito[0]] =splito[1]
+    return DB
+
+#
+# Find out whether the original string for a hashed/encoded string is known. If so, return the original string; otherwise return an empty string. (Totally not optimal)
+#
+ 
+def findNameForHash(hash):
+    # Return the plaintext record name whose encodeHash(name, charMap2) equals "hash", or "" when no known name matches.
+    names = ["kindle.account.tokens","kindle.cookie.item","eulaVersionAccepted","login_date","kindle.token.item","login","kindle.key.item","kindle.name.info","kindle.device.info", "MazamaRandomNumber"]
+    for name in names:
+        if hash == encodeHash(name, charMap2):
+            return name
+    # Unknown hash: hand back "" so that printKindleInfo() can print "Unknown Record" instead of a wrong name.
+    return ""
+    
+#
+# Print all the records from the kindle.info file (option -i)
+#
+    
+def printKindleInfo():
+    for record in kindleDatabase:
+        name = findNameForHash(record)
+        if name != "" :
+            print (name)
+            print ("--------------------------\n")
+        else :
+            print ("Unknown Record")
+        print getKindleInfoValueForHash(record)
+        print "\n"
+#
+# Get a record from the Kindle.info file for the key "hashedKey" (already hashed and encoded). Return the decoded and decrypted record
+#
+
+def getKindleInfoValueForHash(hashedKey):
+    global kindleDatabase
+    encryptedValue = decode(kindleDatabase[hashedKey],charMap2)
+    return CryptUnprotectData(encryptedValue,"")
+ 
+#
+#  Get a record from the Kindle.info file for the string in "key" (plaintext). Return the decoded and decrypted record
+#
+   
+def getKindleInfoValueForKey(key):
+    return getKindleInfoValueForHash(encodeHash(key,charMap2))
+  
+#
+# Get a 7 bit encoded number from the book file
+#
+
+def bookReadEncodedNumber():
+    flag = False
+    data = ord(bookFile.read(1))
+    
+    if data == 0xFF:
+       flag = True
+       data = ord(bookFile.read(1))
+       
+    if data >= 0x80:
+        datax = (data & 0x7F)
+        while data >= 0x80 :
+            data = ord(bookFile.read(1))
+            datax = (datax <<7) + (data & 0x7F)
+        data = datax 
+    
+    if flag:
+       data = -data
+    return data
+    
+#
+# Encode a number in 7 bit format
+#
+
+def encodeNumber(number):
+   result = ""
+   negative = False
+   flag = 0
+   
+   if number < 0 :
+       number = -number + 1
+       negative = True
+   
+   while True:
+       byte = number & 0x7F
+       number = number >> 7
+       byte += flag
+       result += chr(byte)
+       flag = 0x80
+       if number == 0 : break
+   
+   if negative:
+       result += chr(0xFF)
+   
+   return result[::-1]
+  
+#
+# Get a length prefixed string from the file 
+#
+
+def bookReadString():
+    stringLength = bookReadEncodedNumber()
+    return unpack(str(stringLength)+"s",bookFile.read(stringLength))[0]  
+    
+#
+# Returns a length prefixed string
+#
+
+def lengthPrefixString(data):
+    return encodeNumber(len(data))+data
+    
+
+#
+# Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
+#
+    
+def bookReadHeaderRecordData():
+    nbValues = bookReadEncodedNumber()
+    values = []
+    for i in range (0,nbValues):
+        values.append([bookReadEncodedNumber(),bookReadEncodedNumber(),bookReadEncodedNumber()])
+    return values
+   
+#
+# Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...]
+#
+
+def parseTopazHeaderRecord():
+    if ord(bookFile.read(1)) != 0x63:
+        raise CMBDTCFatal("Parse Error : Invalid Header")
+    
+    tag = bookReadString()
+    record = bookReadHeaderRecordData()
+    return [tag,record]
+
+#
+# Parse the header of a Topaz file, get all the header records and the offset for the payload
+#
+ 
+def parseTopazHeader():
+    global bookHeaderRecords
+    global bookPayloadOffset
+    magic = unpack("4s",bookFile.read(4))[0]
+    
+    if magic != 'TPZ0':
+        raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")
+        
+    nbRecords = bookReadEncodedNumber()
+    bookHeaderRecords = {}
+   
+    for i in range (0,nbRecords):
+        result = parseTopazHeaderRecord()
+        print result[0], result[1]
+        bookHeaderRecords[result[0]] = result[1]
+    
+    if ord(bookFile.read(1))  != 0x64 :
+        raise CMBDTCFatal("Parse Error : Invalid Header")
+    
+    bookPayloadOffset = bookFile.tell()
+   
+#
+# Get a record in the book payload, given its name and index. If necessary the record is decrypted
+# and, when the header lists a non-zero compressed length, decompressed with zlib before it is returned
+#
+
+def getBookPayloadRecord(name, index):   
+    encrypted = False
+    compressed = False
+
+    try: 
+        recordOffset = bookHeaderRecords[name][index][0]
+    except:
+        raise CMBDTCFatal("Parse Error : Invalid Record, record not found")
+    
+    bookFile.seek(bookPayloadOffset + recordOffset)
+    
+    tag = bookReadString()
+    if tag != name :
+        raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")
+    
+    recordIndex = bookReadEncodedNumber()
+    
+    if recordIndex < 0 :
+        encrypted = True
+        recordIndex = -recordIndex -1
+    
+    if recordIndex != index :
+      raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")
+            
+    if (bookHeaderRecords[name][index][2] > 0):
+        compressed = True
+        record = bookFile.read(bookHeaderRecords[name][index][2])
+    else:
+        record = bookFile.read(bookHeaderRecords[name][index][1])
+ 
+    if encrypted:
+       ctx = topazCryptoInit(bookKey)
+       record = topazCryptoDecrypt(record,ctx)
+
+    if compressed:
+        record = zlib.decompress(record)
+    
+    return record
+
+#
+# Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
+#
+
+def extractBookPayloadRecord(name, index, filename):
+    # Fetch payload record (name, index) and write it to "filename", or print it
+    # when "filename" is the empty string. getBookPayloadRecord() has already
+    # decrypted and decompressed the data, so nothing more is done to it here.
+    #
+    # Raises CMBDTCFatal when the destination file cannot be written.
+
+    try:
+        record = getBookPayloadRecord(name,index)
+    except Exception:
+        # Best effort: report the problem and stop. Carrying on would reference
+        # an unbound "record" (after already truncating the destination file).
+        print("Could not find record")
+        return
+
+    # An empty filename selects standard output instead of a file.
+    if filename != "":
+        try:
+            with open(filename,"wb") as out:
+                out.write(record)
+        except (IOError, OSError):
+            raise CMBDTCFatal("Could not write to destination file")
+    else:
+        print(record)
+    
+#
+# return next record [key,value] from the book metadata from the current book position
+#  
+
+def readMetadataRecord():
+    return [bookReadString(),bookReadString()]
+    
+#
+# Parse the metadata record from the book payload into the global bookMetadata dict (key -> value); nothing is returned
+#
+
+def parseMetadata():
+    global bookHeaderRecords
+    global bookPayloadAddress
+    global bookMetadata
+    bookMetadata = {}
+    bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0])
+    tag = bookReadString()
+    if tag != "metadata" :
+        raise CMBDTCFatal("Parse Error : Record Names Don't Match")
+    
+    flags = ord(bookFile.read(1))
+    nbRecords = ord(bookFile.read(1))
+    
+    for i in range (0,nbRecords) :
+        record =readMetadataRecord()
+        bookMetadata[record[0]] = record[1]
+
+#
+# Returns the two bits at offset from a bit field
+#
+   
+def getTwoBitsFromBitField(bitField,offset):
+    byteNumber = offset // 4
+    bitPosition = 6 - 2*(offset % 4)
+    
+    return ord(bitField[byteNumber]) >> bitPosition & 3
+
+#
+# Returns the six bits at offset from a bit field
+#    
+
+def getSixBitsFromBitField(bitField,offset):
+     offset *= 3
+     value = (getTwoBitsFromBitField(bitField,offset) <<4) + (getTwoBitsFromBitField(bitField,offset+1) << 2) +getTwoBitsFromBitField(bitField,offset+2)
+     return value
+     
+#
+# 8 bits to six bits encoding from hash to generate PID string
+#
+
+def encodePID(hash):
+    global charMap3
+    PID = ""
+    for position in range (0,8):
+        PID += charMap3[getSixBitsFromBitField(hash,position)]
+    return PID
+    
+#
+# Context initialisation for the Topaz Crypto
+#
+
+def topazCryptoInit(key):
+    ctx1 = 0x0CAFFE19E
+    
+    for keyChar in key:
+        keyByte = ord(keyChar)
+        ctx2 = ctx1 
+        ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF )
+    return [ctx1,ctx2]
+    
+#
+# decrypt data with the context prepared by topazCryptoInit()
+#
+    
+def topazCryptoDecrypt(data, ctx):
+    ctx1 = ctx[0]
+    ctx2 = ctx[1]
+    
+    plainText = ""
+    
+    for dataChar in data:
+        dataByte = ord(dataChar)
+        m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF
+        ctx2 = ctx1
+        ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF)
+        plainText += chr(m)
+        
+    return plainText
+
+#
+# Decrypt a payload record with the PID
+#
+
+def decryptRecord(data,PID):
+    ctx = topazCryptoInit(PID)
+    return topazCryptoDecrypt(data, ctx)
+
+#
+# Try to decrypt a dkey record (contains the book PID)
+#
+
+def decryptDkeyRecord(data,PID):
+    record = decryptRecord(data,PID)
+    fields = unpack("3sB8sB8s3s",record)
+    
+    if fields[0] != "PID" or fields[5] != "pid" :
+        raise CMBDTCError("Didn't find PID magic numbers in record")
+    elif fields[1] != 8 or fields[3] != 8 :
+        raise CMBDTCError("Record didn't contain correct length fields")
+    elif fields[2] != PID :
+        raise CMBDTCError("Record didn't contain PID")
+    
+    return fields[4]
+    
+#
+# Decrypt all the book's dkey records (contain the book PID)
+#
+  
+def decryptDkeyRecords(data,PID):
+    nbKeyRecords = ord(data[0])
+    records = []
+    data = data[1:]
+    for i in range (0,nbKeyRecords):
+        length = ord(data[0])
+        try:
+            key = decryptDkeyRecord(data[1:length+1],PID)
+            records.append(key)
+        except CMBDTCError:
+            pass
+        data = data[1+length:]
+        
+    return records
+    
+#
+# Encryption table used to generate the device PID
+#
+    
+def generatePidEncryptionTable() :
+    table = []
+    for counter1 in range (0,0x100):
+        value = counter1
+        for counter2 in range (0,8):
+            if (value & 1 == 0) :
+                value = value >> 1
+            else :
+                value = value >> 1
+                value = value ^ 0xEDB88320
+        table.append(value)
+    return table
+ 
+#
+# Seed value used to generate the device PID
+#
+   
+def generatePidSeed(table,dsn) :
+    value = 0
+    for counter in range (0,4) :
+       index = (ord(dsn[counter]) ^ value) &0xFF
+       value = (value >> 8) ^ table[index]
+    return value
+   
+#
+# Generate the device PID
+#
+
+def generateDevicePID(table,dsn,nbRoll):
+    seed = generatePidSeed(table,dsn)
+    pidAscii = ""
+    pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
+    index = 0
+    
+    for counter in range (0,nbRoll):
+        pid[index] = pid[index] ^ ord(dsn[counter])
+        index = (index+1) %8
+ 
+    for counter in range (0,8):
+        index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
+        pidAscii += charMap4[index]
+    return pidAscii
+    
+#
+# Create decrypted book payload
+#
+
+def createDecryptedPayload(payload):
+    # Write every payload record except "dkey" below the directory "payload".
+    # 'img', 'page' and 'glyphs' records go into the sub-directory of the same
+    # name (not created here); all other records go into "payload" itself.
+    # Files are named <record name><4-digit index><ext>; ext is '.jpg' for 'img'
+    # records and '.dat' otherwise.
+    for name in bookHeaderRecords:
+        if name == "dkey" :
+            continue
+        ext = '.jpg' if name == 'img' else '.dat'
+        destdir = payload
+        if name in ('img', 'page', 'glyphs'):
+            destdir = os.path.join(payload, name)
+        for index in range (0,len(bookHeaderRecords[name])) :
+            outputFile = os.path.join(destdir, "%s%04d%s" % (name, index, ext))
+            # "with" closes the file right away instead of leaving it to the garbage collector
+            with open(outputFile, 'wb') as out:
+                out.write(getBookPayloadRecord(name, index))
+                   
+
+# Create decrypted book
+#
+
+def createDecryptedBook(outdir):
+    if not os.path.exists(outdir):
+        os.makedirs(outdir)
+
+    destdir =  os.path.join(outdir,'img')
+    if not os.path.exists(destdir):
+        os.makedirs(destdir)
+
+    destdir =  os.path.join(outdir,'page')
+    if not os.path.exists(destdir):
+        os.makedirs(destdir)
+
+    destdir =  os.path.join(outdir,'glyphs')
+    if not os.path.exists(destdir):
+        os.makedirs(destdir)
+
+    createDecryptedPayload(outdir)
+
+
+#
+# Set the command to be executed by the program according to the command-line parameters
+#
+
+def setCommand(name) :
+    global command
+    if command != "" :
+         raise CMBDTCFatal("Invalid command line parameters")
+    else :
+        command = name
+
+# 
+# Program usage
+#
+   
+def usage():
+    print("\nUsage:")
+    print("\ncmbtc_dump.py [options] bookFileName\n")
+    print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)")
+    print("-d Dumps the unencrypted book as files to outdir")
+    print("-o Output directory to save book files to")
+    print("-v Verbose (can be used several times)")
+    print("-i Prints kindle.info database")
+ 
+#
+# Main
+#   
+
+def main(argv=sys.argv):
+    global kindleDatabase
+    global bookMetadata
+    global bookKey
+    global bookFile
+    global command
+    
+    progname = os.path.basename(argv[0])
+    
+    verbose = 0
+    recordName = ""
+    recordIndex = 0
+    outdir = ""
+    PIDs = []
+    kindleDatabase = None
+    command = ""
+    
+    
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], "vi:o:p:d")
+    except getopt.GetoptError, err:
+        # print help information and exit:
+        print str(err) # will print something like "option -a not recognized"
+        usage()
+        sys.exit(2)
+    
+    if len(opts) == 0 and len(args) == 0 :
+        usage()
+        sys.exit(2) 
+       
+    for o, a in opts:
+        if o == "-v":
+            verbose+=1
+        if o == "-i":
+            setCommand("printInfo")
+        if o =="-o":
+            if a == None :
+                raise CMBDTCFatal("Invalid parameter for -o")
+            outdir = a
+        if o =="-p":
+            PIDs.append(a)
+        if o =="-d":
+            setCommand("doit")
+            
+    if command == "" :
+        raise CMBDTCFatal("No action supplied on command line")
+   
+    #
+    # Read the encrypted database
+    #
+    
+    try:
+        kindleDatabase = parseKindleInfo()
+    except Exception as message:
+        if verbose>0:
+            print(message)
+    
+    if kindleDatabase != None :
+        if command == "printInfo" :
+            printKindleInfo()
+     
+    #
+    # Compute the DSN
+    #
+    
+    # Get the Mazama Random number
+        MazamaRandomNumber = getKindleInfoValueForKey("MazamaRandomNumber")
+    
+    # Get the HDD serial
+        encodedSystemVolumeSerialNumber = encodeHash(str(GetVolumeSerialNumber(GetSystemDirectory().split('\\')[0] + '\\')),charMap1)
+    
+    # Get the current user name
+        encodedUsername = encodeHash(GetUserName(),charMap1)
+    
+    # concat, hash and encode
+        DSN = encode(SHA1(MazamaRandomNumber+encodedSystemVolumeSerialNumber+encodedUsername),charMap1)
+       
+        if verbose >1:
+            print("DSN: " + DSN)
+    
+    #
+    # Compute the device PID
+    #
+     
+        table =  generatePidEncryptionTable()
+        devicePID = generateDevicePID(table,DSN,4)
+        PIDs.append(devicePID)
+    
+        if verbose > 0:
+            print("Device PID: " + devicePID)
+    
+    #
+    # Open book and parse metadata
+    #
+        
+    if len(args) == 1:
+    
+        bookFile = openBook(args[0])
+        parseTopazHeader()
+        parseMetadata()
+    
+    #
+    # Compute book PID
+    # 
+    
+    # Get the account token
+    
+        if kindleDatabase != None:
+            kindleAccountToken = getKindleInfoValueForKey("kindle.account.tokens")
+    
+            if verbose >1:
+                print("Account Token: " + kindleAccountToken)
+
+            keysRecord = bookMetadata["keys"]
+            keysRecordRecord = bookMetadata[keysRecord]
+    
+            pidHash = SHA1(DSN+kindleAccountToken+keysRecord+keysRecordRecord)
+   
+            bookPID = encodePID(pidHash)
+            PIDs.append(bookPID)
+    
+            if verbose > 0:
+                print ("Book PID: " + bookPID )
+    
+    #
+    #  Decrypt book key
+    #
+    
+        dkey = getBookPayloadRecord('dkey', 0) 
+        
+        bookKeys = []
+        for PID in PIDs :
+            bookKeys+=decryptDkeyRecords(dkey,PID)
+            
+        if len(bookKeys) == 0 :
+            if verbose > 0 :
+                print ("Book key could not be found. Maybe this book is not registered with this device.")
+        else :
+            bookKey = bookKeys[0]
+            if verbose > 0:
+                print("Book key: " + bookKey.encode('hex'))
+                
+            
+                  
+            if command == "printRecord" :
+                extractBookPayloadRecord(recordName,int(recordIndex),outputFile)
+                if outputFile != "" and verbose>0 :
+                    print("Wrote record to file: "+outputFile) 
+            elif command == "doit" :
+                if outdir != "" :
+                    createDecryptedBook(outdir)
+                    if verbose >0 :
+                        print ("Decrypted book saved. Don't pirate!")
+                elif verbose > 0:
+                    print("Output directory name was not supplied.")
+    
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py

new file mode 100644 (file)

index 0000000..86d08d4
--- /dev/null
+++ b/Topaz_Tools/lib/convert2xml.py
@@ -0,0 +1,821 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import with_statement
+import csv
+import sys
+import os
+import getopt
+from struct import pack
+from struct import unpack
+
+
+# Get a 7 bit encoded number from an open file. The most
+# significant byte comes first and has the high bit (8th) set
+
+def readEncodedNumber(file):
+    flag = False
+    c = file.read(1)
+    if (len(c) == 0):
+        return None
+    data = ord(c)
+    
+    if data == 0xFF:
+       flag = True
+       c = file.read(1)
+       if (len(c) == 0):
+           return None
+       data = ord(c)
+       
+    if data >= 0x80:
+        datax = (data & 0x7F)
+        while data >= 0x80 :
+            c = file.read(1)
+            if (len(c) == 0): 
+                return None
+            data = ord(c)
+            datax = (datax <<7) + (data & 0x7F)
+        data = datax 
+    
+    if flag:
+       data = -data
+    return data
+    
+
+# returns a binary string that encodes a number into 7 bits
+# most significant byte first which has the high bit set
+
+def encodeNumber(number):
+   result = ""
+   negative = False
+   flag = 0
+   
+   if number < 0 :
+       number = -number + 1
+       negative = True
+   
+   while True:
+       byte = number & 0x7F
+       number = number >> 7
+       byte += flag
+       result += chr(byte)
+       flag = 0x80
+       if number == 0 : break
+   
+   if negative:
+       result += chr(0xFF)
+   
+   return result[::-1]
+  
+
+
+# create / read  a length prefixed string from the file
+
+def lengthPrefixString(data):
+    return encodeNumber(len(data))+data
+
+def readString(file):
+    stringLength = readEncodedNumber(file)
+    if (stringLength == None):
+        return ""
+    sv = file.read(stringLength)
+    if (len(sv)  != stringLength):
+        return ""
+    return unpack(str(stringLength)+"s",sv)[0]  
+
+ 
+# convert a binary string generated by encodeNumber (7 bit encoded number)
+# to the value you would find inside the page*.dat files to be processed
+
+def convert(i):
+    # Return the bytes of encodeNumber(i) as one hex string, two lowercase hex digits per byte.
+    val = encodeNumber(i)
+    result = ''
+    for c in val:
+        result += '%02x' % ord(c)
+    return result   # after the loop, so multi-byte encodings are not cut off after the first byte
+
+
+
+# the complete string table used to store all book text content
+# as well as the xml tokens and values that make sense out of it
+
+class Dictionary(object):
+    def __init__(self, dictFile):
+        self.filename = dictFile
+        self.size = 0
+        self.fo = file(dictFile,'rb')
+        self.stable = []
+        self.size = readEncodedNumber(self.fo)
+        for i in xrange(self.size):
+            self.stable.append(self.escapestr(readString(self.fo)))
+        self.pos = 0
+
+    def escapestr(self, str):
+        str = str.replace('&','&amp;')
+        str = str.replace('<','&lt;')
+        str = str.replace('>','&gt;')
+        str = str.replace('=','&#61;')
+        return str
+
+    def lookup(self,val):
+        if ((val >= 0) and (val < self.size)) :
+            self.pos = val
+            return self.stable[self.pos]
+        else:
+            print "Error - %d outside of string table limits" % val
+            sys.exit(-1)
+
+    def getSize(self):
+        return self.size
+
+    def getPos(self):
+        return self.pos
+
+    def dumpDict(self):
+        for i in xrange(self.size):
+            print "%d %s %s" % (i, convert(i), self.stable[i])
+        return
+
+# parses the xml snippets that are represented by each page*.dat file.
+# also parses the other0.dat file - the main stylesheet
+# and information used to inject the xml snippets into page*.dat files
+
+class PageParser(object):
+    def __init__(self, filename, dict, debug, flat_xml):
+        self.fo = file(filename,'rb')
+        self.id = os.path.basename(filename).replace('.dat','')
+        self.dict = dict
+        self.debug = debug
+        self.flat_xml = flat_xml
+        self.tagpath = []
+        self.doc = []
+        self.snippetList = []
+
+
+    # hash table used to enable the decoding process
+    # This has all been developed by trial and error so it may still have omissions or
+    # contain errors
+    # Format:
+    # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
+
+    token_tags = {
+        'book'         : (1, 'snippets', 1, 0),
+        'version'      : (1, 'snippets', 1, 0),
+        'stylesheet'   : (1, 'snippets', 1, 0),
+        'links'        : (0, 'number', 0, 1),
+        'pages'        : (0, 'number', 0, 1),
+        'page'         : (1, 'snippets', 1, 0),
+        'group'        : (1, 'snippets', 1, 0),
+        'region'       : (1, 'snippets', 1, 0),
+        'reflow'       : (1, 'number', 1, 0),
+        'img'          : (1, 'snippets', 1, 0),
+        'paragraph'    : (1, 'snippets', 1, 0),
+        'extratokens'  : (1, 'snippets', 1, 0),
+        'style'        : (1, 'snippets', 1, 0),
+        'rule'         : (1, 'snippets', 1, 0),
+        '_span'        : (1, 'snippets', 1, 0),
+        'word_semantic': (1, 'snippets', 1, 1),
+        'value'        : (1, 'scalar_text', 0, 0),
+        'h'            : (1, 'scalar_number', 0, 0),
+        'w'            : (1, 'scalar_number', 0, 0),
+        'firstWord'    : (1, 'scalar_number', 0, 0),
+        'lastWord'     : (1, 'scalar_number', 0, 0),
+        'x'            : (1, 'number', 0, 0),
+        'y'            : (1, 'number', 0, 0),
+        'links.page'   : (1, 'number', 0, 0),
+        'link_id'      : (1, 'number', 0, 0),
+        'glyph'        : (0, 'number', 1, 1),
+        'glyph.h'      : (1, 'number', 0, 0),
+        'glyph.w'      : (1, 'number', 0, 0),
+        'sh'           : (1, 'number', 0, 0),
+        'word'         : (0, 'number', 1, 1),
+        'src'          : (1, 'scalar_number', 0, 0),
+        'rel'          : (1, 'number', 0, 0),
+        'row'          : (1, 'number', 0, 0),
+        'startID'      : (1, 'number', 0, 1),
+        'startID.page' : (1, 'number', 0, 0),
+        'glyphID'      : (1, 'number', 0, 0),
+        'rootID'       : (1, 'number', 0, 0),
+        'stemID'       : (1, 'number', 0, 0),
+        'margin-top'   : (1, 'number', 0, 0),
+        'stemPage'     : (1, 'number', 0, 0),
+        'dehyphen'     : (1, 'number', 1, 1),
+        'rootID'       : (1, 'number', 0, 0),
+        'paraCont'     : (1, 'number', 1, 1),
+        'paraStems'    : (1, 'number', 1, 1),
+        'wordStems'    : (1, 'number', 1, 1),
+        'original'     : (0, 'number', 0, 1),
+        'use'          : (1, 'number', 0, 0),
+        'vtx'          : (1, 'number', 0, 1),
+        'len'          : (1, 'number', 0, 1),
+        'dpi'          : (1, 'number', 0, 0),
+        'n'            : (1, 'number', 0, 0),
+        'id'           : (1, 'number', 0, 0),
+        'ref'          : (1, 'number', 0, 0),
+        'pnum'         : (1, 'number', 0, 0),
+        'pid'          : (1, 'text', 0, 0),
+        'info'         : (0, 'number', 1, 0),
+        'bl'           : (1, 'raw', 0, 0),
+        'firstGlyph'   : (1, 'raw', 0, 0),
+        'lastGlyph'    : (1, 'raw', 0, 0),
+        'ocrText'      : (1, 'text', 0, 0),
+        'title'        : (1, 'text', 0, 0),
+        'href'         : (1, 'text', 0, 0),
+        '_parent_type' : (1, 'text', 0, 0),
+        'attr'         : (1, 'scalar_text', 0, 0),
+        'justify'      : (1, 'scalar_text', 0, 0),
+        'align'        : (1, 'scalar_text', 0, 0),
+        'layout'       : (1, 'scalar_text', 0, 0),
+        'pageid'       : (1, 'scalar_text', 0, 0),
+        'pagelabel'    : (1, 'scalar_text', 0, 0),
+        'type'         : (1, 'text', 0, 0),
+        'class'        : (1, 'scalar_text', 0, 0),
+        'container'    : (1, 'scalar_text', 0, 0),
+        '_after_class' : (1, 'scalar_text', 0, 0),
+        '_tag'         : (1, 'scalar_text', 0, 0),
+        'pos'          : (1, 'scalar_text', 0, 0),
+        'page_num'     : (1, 'scalar_text', 0, 0),
+        'page_type'    : (1, 'scalar_text', 0, 0),
+        'findlists'    : (1, 'scalar_text', 0, 0),
+        'FlowEdit_1_id'            : (1, 'scalar_text', 0, 0),
+        'FlowEdit_1_version'       : (1, 'scalar_text', 0, 0),
+        'Schema_id'                : (1, 'scalar_text', 0, 0),
+        'Schema_version'           : (1, 'scalar_text', 0, 0),
+        'Topaz_version'            : (1, 'scalar_text', 0, 0),
+        'WordDetailEdit_1_id'      : (1, 'scalar_text', 0, 0),
+        'WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
+        'ZoneEdit_1_id'            : (1, 'scalar_text', 0, 0),
+        'ZoneEdit_1_version'       : (1, 'scalar_text', 0, 0),
+        'chapterheaders'           : (1, 'scalar_text', 0, 0),
+        'creation_date'            : (1, 'scalar_text', 0, 0),
+        'header_footer'            : (1, 'scalar_text', 0, 0),
+        'init_from_ocr'            : (1, 'scalar_text', 0, 0),
+        'letter_insertion'         : (1, 'scalar_text', 0, 0),
+        'xmlinj_convert'           : (1, 'scalar_text', 0, 0),
+        'xmlinj_reflow'            : (1, 'scalar_text', 0, 0),
+        'xmlinj_transform'         : (1, 'scalar_text', 0, 0),
+     }
+
+
+    # full tag path record keeping routines
+    def tag_push(self, token):
+        """Append token to the end of the current tag path (self.tagpath)."""
+        self.tagpath.append(token)
+    def tag_pop(self):
+        """Drop the innermost tag from the tag path; do nothing if the path is empty."""
+        if not self.tagpath :
+            return
+        self.tagpath.pop()
+    def tagpath_len(self):
+        """Return the number of tags currently on the tag path."""
+        return len(self.tagpath)
+    def get_tagpath(self, i):
+        """Return the dotted tag path starting at depth i.
+
+        The tags from index i to the end of self.tagpath are joined
+        with '.', e.g. ['info', 'word', 'ocrText'] with i == 1 gives
+        'word.ocrText'.  An index at or past the end of the path gives
+        an empty string (previously this raised UnboundLocalError).
+        """
+        return '.'.join(self.tagpath[i:])
+            
+
+    # list of absolute command byte values that indicate
+    # various types of loop mechanisms typically used to generate vectors
+    # NOTE(review): 0x76 is listed twice; presumably a second loop command
+    # was once planned here - confirm before removing the duplicate
+
+    cmd_list = (0x76, 0x76)
+
+    # peek at and return 1 byte that is ahead by i bytes 
+    def peek(self, aheadi):
+        """Return the byte that is aheadi bytes ahead without consuming it.
+
+        aheadi -- how far to look ahead; 1 means the very next byte.
+
+        Returns the byte as an integer, or None when fewer than aheadi
+        bytes remain in the file.  The file position is always restored
+        to where it was before the call.
+        """
+        c = self.fo.read(aheadi)
+        # rewind by the number of bytes actually read: near the end of the
+        # file a short read must not move the position before the start point
+        self.fo.seek(-len(c), 1)
+        if (len(c) == 0) or (len(c) < aheadi):
+            return None
+        return ord(c[-1:])
+
+
+    # get the next value from the file being processed
+    def getNext(self):
+        """Return the next encoded number from the file, or None at end of file."""
+        # peek first so that end of file is detected before decoding starts
+        if self.peek(1) is None:
+            return None
+        return readEncodedNumber(self.fo)
+
+
+    # format an arg by argtype
+    def formatArg(self, arg, argtype):
+        """Convert a raw argument value according to its argument type.
+
+        Text types are looked up in the string dictionary; raw, number
+        and snippet types are passed through unchanged.  An unknown
+        argtype prints an error and exits the program with status -2.
+        """
+        if argtype in ('text', 'scalar_text') :
+            return self.dict.lookup(arg)
+        if argtype in ('raw', 'number', 'scalar_number', 'snippets') :
+            return arg
+        print "Error Unknown argtype %s" % argtype
+        sys.exit(-2)
+
+
+    # process the next tag token, recursively handling subtags, 
+    # arguments, and commands
+    def procToken(self, token):
+        """Parse one tag, its subtags and its arguments from the file.
+
+        token -- tag name (already looked up in the string dictionary).
+
+        Returns [full tag path, list of subtag results, argtype,
+        list of argument values] for a known tag, or an empty list when
+        no suffix of the current tag path is found in self.token_tags.
+        The token is pushed on the tag path for the duration of the call.
+        """
+
+        known_token = False
+        self.tag_push(token)
+
+        if self.debug : print 'Processing: ', self.get_tagpath(0)
+        cnt = self.tagpath_len()
+        # try the full path first, then ever shorter path suffixes;
+        # the first one present in token_tags supplies the tag description
+        for j in xrange(cnt):
+            tkn = self.get_tagpath(j)
+            if tkn in self.token_tags :
+                num_args = self.token_tags[tkn][0]
+                argtype = self.token_tags[tkn][1]
+                subtags = self.token_tags[tkn][2]
+                splcase = self.token_tags[tkn][3]
+                ntags = -1
+                known_token = True
+                break
+
+        if known_token :
+
+            # handle subtags if present 
+            subtagres = []
+            if (splcase == 1):
+                # this type of tag uses of escape marker 0x74 indicate subtag count
+                if self.peek(1) == 0x74:
+                    # consume the marker; the tag then has subtags and no arguments
+                    skip = readEncodedNumber(self.fo)
+                    subtags = 1
+                    num_args = 0
+
+            if (subtags == 1): 
+                ntags = readEncodedNumber(self.fo)
+                if self.debug : print 'subtags: ' + token + ' has ' + str(ntags)
+                for j in xrange(ntags):
+                    val = readEncodedNumber(self.fo)
+                    subtagres.append(self.procToken(self.dict.lookup(val)))
+
+            # arguments can be scalars or vectors of text or numbers
+            argres = []
+            if num_args > 0 :
+                firstarg = self.peek(1)
+                if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'):
+                    # single argument is a variable length vector of data
+                    arg = readEncodedNumber(self.fo)
+                    argres = self.decodeCMD(arg,argtype)
+                else :
+                    # num_arg scalar arguments
+                    for i in xrange(num_args):
+                        argres.append(self.formatArg(readEncodedNumber(self.fo), argtype))
+
+            # build the return tag
+            result = []
+            tkn = self.get_tagpath(0)
+            result.append(tkn)
+            result.append(subtagres)
+            result.append(argtype)
+            result.append(argres)
+            self.tag_pop()
+            return result
+
+        # all tokens that need to be processed should be in the hash
+        # table if it may indicate a problem, either new token 
+        # or an out of sync condition
+        else:
+            result = []
+            if (self.debug):
+                print 'Unknown Token:', token
+            self.tag_pop()
+            return result
+
+
+    # special loop used to process code snippets
+    # it is NEVER used to format arguments.
+    # builds the snippetList
+    def doLoop72(self, argtype):
+        """Read a counted set of xml snippets and append them to self.snippetList.
+
+        Each entry appended is [snippet number, parsed tag].  argtype is
+        accepted for symmetry with the other loop handlers but is not used.
+        """
+        total = readEncodedNumber(self.fo)
+        if self.debug :
+            banner = 'Set of '+ str(total) + ' xml snippets. The overall structure \n' \
+                     'of the document is indicated by snippet number sets at the\n' \
+                     'end of each snippet. \n'
+            print banner
+        for index in xrange(total):
+            if self.debug: print 'Snippet:',str(index)
+            # the token id comes first, then procToken consumes the tag body
+            token = self.dict.lookup(readEncodedNumber(self.fo))
+            self.snippetList.append([index, self.procToken(token)])
+        return
+
+
+    # loop: pass though values unchanged
+    # DO NOT CHANGE - this has proven to be correct
+    def doLoop76Mode0(self, argtype, cnt):
+        """Read cnt encoded numbers and return them formatted by argtype, unchanged."""
+        result = [] 
+        for i in xrange(cnt):
+            result.append(self.formatArg(readEncodedNumber(self.fo), argtype))
+        return result
+
+
+    # loop generating values relative to the *negative* 
+    # of the offset - don't ask why - it just is
+    # DO NOT CHANGE - this has proven to be correct
+    def doLoop76Mode1(self, argtype, cnt):
+        """Read an offset, then cnt values; return each value minus the offset.
+
+        The offset is read first and negated, then added to every value
+        before it is formatted by argtype.
+        """
+        result = []
+        offset = -readEncodedNumber(self.fo)
+        for i in xrange(cnt):
+            val = readEncodedNumber(self.fo) + offset
+            result.append(self.formatArg(val, argtype))
+        return result
+
+
+    # loop generating values with starting value and accumulation
+    # DO NOT CHANGE - this has proven to be the correct
+    def doLoop76Mode2(self, argtype, cnt):
+        """Read a starting value followed by cnt-1 deltas; return the running totals.
+
+        The first result is the starting value itself; each later result
+        is the previous one plus the next number read.  One value is
+        always read and returned, even when cnt is 0.
+        """
+        result = []
+        ptr = readEncodedNumber(self.fo)
+        result.append(self.formatArg(ptr, argtype))
+        for i in xrange(cnt-1):
+            ptr = ptr + readEncodedNumber(self.fo) 
+            result.append(self.formatArg(ptr, argtype))
+        return result
+
+
+    # loop generating values with starting value and accumulation
+    # **after** subtracting adjustment value from each
+    # DO NOT CHANGE - this has been proven to be correct
+    def doLoop76Mode3(self, argtype, cnt):
+        """Read an adjustment, a start value and cnt-1 deltas; return adjusted running totals.
+
+        The adjustment is subtracted from the start value and from every
+        delta before it is accumulated.  One value is always returned,
+        even when cnt is 0.
+        """
+        result = []
+        adj = readEncodedNumber(self.fo)
+        ptr = readEncodedNumber(self.fo)
+        ptr = ptr - adj 
+        result.append(self.formatArg(ptr, argtype))
+        for i in xrange(cnt-1):
+            ptr = ptr + readEncodedNumber(self.fo) - adj
+            result.append(self.formatArg(ptr,argtype))
+        return result
+
+
+    # loop using runing sum of data values and starting value
+    # with accumulation to get new value
+    # Again, don't ask it took me forever to figure this out
+    # DO NOT CHANGE - this has been proven to be correct
+    def doLoop76Mode4(self, argtype, cnt):
+        """Read cnt values and return totals built from a running sum of them.
+
+        The first value read is both the first result and the initial
+        running sum.  Each later value is added to the running sum, and
+        the running sum is then added to the previous result.
+        """
+        result = []
+        val = readEncodedNumber(self.fo)
+        runsum = val
+        ptr = val
+        result.append(self.formatArg(ptr, argtype))
+        for i in xrange(cnt-1):
+            runsum += readEncodedNumber(self.fo)
+            ptr = ptr + runsum
+            result.append(self.formatArg(ptr,argtype))
+        return result
+
+
+    # loop using and extra value as an adjustment
+    # and a running sum of the values after subtracting
+    # the adjustment, added to a ptr to get a new pointer
+    def doLoop76Mode5(self, argtype, cnt):
+        """Read an adjustment and cnt values; return totals of an adjusted running sum.
+
+        Each value has the adjustment subtracted and is added to a
+        running sum (starting at 0); the running sum is then added to the
+        previous result (also starting at 0).
+        """
+        result = []
+        adj = readEncodedNumber(self.fo)
+        ptr = 0
+        runsum = 0
+        for i in xrange(cnt):
+            val = readEncodedNumber(self.fo)
+            runsum += (val - adj)
+            ptr = ptr +runsum
+            result.append(self.formatArg(ptr,argtype))
+        return result
+
+
+    # FIXME:  I have only 4 points to work this out with inside my book
+    # So may be wrong but it is correct for my 4 points
+    def doLoop76Mode6(self, argtype, cnt):
+        """Read cnt values; each result is 3 * previous raw value + current value + 1.
+
+        The previous raw value starts at 0.  The formula is a best guess
+        from very few samples (see the FIXME above this method).
+        """
+        result = []
+        oldval = 0
+        for i in xrange(cnt):
+            val = readEncodedNumber(self.fo)
+            ptr= (3 * oldval) + val + 1
+            result.append(self.formatArg(ptr,argtype))
+            oldval = val
+        return result
+
+
+
+    # dispatches loop commands bytes with various modes
+    # The 0x76 style loops are used to build vectors
+
+    # This was all derived by trial and error and 
+    # new loop types may exist that are not handled here
+    # since they did not appear in the test cases
+
+    def decodeCMD(self, cmd, argtype):
+        """Decode a loop command into a list of argument values.
+
+        cmd     -- command byte value already read from the file.
+        argtype -- argument type used to format each generated value.
+
+        Only the 0x76 loop command is handled: its count and mode are
+        read from the file and the matching doLoop76ModeN routine builds
+        the vector.  An unknown mode or an unknown command returns an
+        empty list.  (The 0x72 snippet loop is dispatched from process(),
+        not from here.)
+        """
+
+        if (cmd == 0x76):
+            # loop with cnt, and mode to control loop styles
+            cnt = readEncodedNumber(self.fo)
+            mode = readEncodedNumber(self.fo)
+
+            if self.debug : print 'Loop for', cnt, 'with  mode', mode,  ':  '  
+
+            loops = {
+                0x00 : self.doLoop76Mode0,
+                0x01 : self.doLoop76Mode1,
+                0x02 : self.doLoop76Mode2,
+                0x03 : self.doLoop76Mode3,
+                0x04 : self.doLoop76Mode4,
+                0x05 : self.doLoop76Mode5,
+                0x06 : self.doLoop76Mode6,
+            }
+            handler = loops.get(mode)
+            if handler is not None:
+                return handler(argtype, cnt)
+
+            if self.debug :
+                # try to mark any unknown loop comands
+                # if they exist, unless they are used to process
+                # text or some other known list, we won't be able to prove them correct
+                # NOTE(review): the cnt values are only consumed when debug
+                # is on, so the file position after an unknown loop depends
+                # on the debug flag - kept as in the original
+                print '*** Unknown Loop 0x%x %d %d :' % (cmd, cnt, mode) 
+                for i in xrange(cnt):
+                    val = readEncodedNumber(self.fo)
+                    print ' 0x%x' % val,
+                    print ' '
+            return []
+
+        # was "self.dbug", which raised AttributeError for every unknown command
+        if self.debug: print  "Unknown command", cmd
+        return []
+            
+    # add full tag path to injected snippets
+    def updateName(self, tag, prefix):
+        """Return a copy of tag with prefix + '.' put in front of every tag name.
+
+        The prefix is applied to the tag itself and, recursively, to all
+        of its subtags.  The argument type and argument list are carried
+        over unchanged (the argument list object is shared, not copied).
+        """
+        name, subtagList, argtype, argList = tag[0], tag[1], tag[2], tag[3]
+        nname = prefix + '.' + name
+        renamed = [self.updateName(sub, prefix) for sub in subtagList]
+        return [nname, renamed, argtype, argList]
+
+
+
+    # perform depth first injection of specified snippets into this one
+    def injectSnippets(self, snippet):
+        """Expand the snippet references of a snippet into real subtags, depth first.
+
+        snippet -- [snippet number, tag]; the tag's argument list holds
+                   indexes into self.snippetList.
+
+        Every referenced snippet is expanded recursively, renamed with
+        this tag's name as prefix and appended to this tag's subtag list
+        (the original subtag list object is extended in place).  Returns
+        [snippet number, tag] where the tag now has argtype 'number' and
+        an empty argument list.
+        """
+        snipno, tag = snippet
+        name = tag[0]
+        subtagList = tag[1]
+        injected = []
+        for ref in tag[3]:
+            child = self.injectSnippets(self.snippetList[ref])[1]
+            injected.append(self.updateName(child, name))
+        subtagList.extend(injected)
+        return [snipno, [name, subtagList, 'number', []]]
+
+
+
+    # format the tag for output
+    def formatTag(self, node):
+        """Render a tag and its subtags as indented xml-like text.
+
+        The last component of the dotted tag name is the element name and
+        the number of leading components sets the indent (3 spaces each).
+        Text arguments are joined with '|', all others with ','; snippet
+        arguments get a 'snippets:' prefix.  Empty subtags are skipped.
+        """
+        name, subtagList, argtype, argList = node[0], node[1], node[2], node[3]
+        path = name.split('.')
+        nodename = path.pop()
+        indent = ' ' * (3 * len(path))
+        pieces = [indent, '<', nodename, '>']
+        if len(argList) > 0:
+            if argtype in ('text', 'scalar_text') :
+                argres = '|'.join(argList)
+            else :
+                argres = ','.join([str(j) for j in argList])
+            if argtype == 'snippets' :
+                pieces.append('snippets:')
+            pieces.append(argres)
+        if len(subtagList) > 0 :
+            pieces.append('\n')
+            for child in subtagList:
+                if len(child) > 0 :
+                    pieces.append(self.formatTag(child))
+            # the closing tag of a parent goes on its own indented line
+            pieces.append(indent)
+        pieces.append('</' + nodename + '>\n')
+        return ''.join(pieces)
+
+
+   # flatten tag
+    def flattenTag(self, node):
+        """Render a tag and its subtags as flat 'name=args' lines.
+
+        Each tag produces one line: the full tag name, then '=' (or
+        '.snippets=' for snippet arguments) and the arguments joined with
+        '|' when there are any.  Lines for the non-empty subtags follow.
+        """
+        name, subtagList, argtype, argList = node[0], node[1], node[2], node[3]
+        line = name
+        if len(argList) > 0:
+            if argtype in ('text', 'scalar_text') :
+                joined = '|'.join(argList)
+            else :
+                joined = '|'.join([str(j) for j in argList])
+            if argtype == 'snippets' :
+                line += '.snippets=' + joined
+            else :
+                line += '=' + joined
+        line += '\n'
+        children = [self.flattenTag(child) for child in subtagList if len(child) > 0]
+        return line + ''.join(children)
+
+
+    # reduce create xml output
+    def formatDoc(self, flat_xml):
+        """Render every non-empty tag of self.doc and return the combined text.
+
+        flat_xml -- when true use the flat 'name=args' form, otherwise
+                    the indented xml-like form.
+        """
+        if flat_xml:
+            render = self.flattenTag
+        else:
+            render = self.formatTag
+        result = ''.join([render(node) for node in self.doc if len(node) > 0])
+        if self.debug : print result
+        return result
+
+
+
+    # main loop - parse the page.dat files
+    # to create structured document and snippets
+
+    # FIXME: value at end of magic appears to be a subtags count
+    # but for what?  For now, inject an 'info" tag as it is in
+    # every dictionary and seems close to what is meant
+    # The alternative is to special case the last _ "0x5f" to mean something
+
+    def process(self):
+        """Parse the whole data file and return its xml description as a string.
+
+        The file header selects the file type: page and glyph files get
+        an inserted 'info' first token, any other file is parsed from its
+        first byte.  Tags are collected in self.doc, 0x72 snippet loops in
+        self.snippetList; snippet 0 is then expanded and appended to the
+        document before it is formatted (flat or indented according to
+        self.flat_xml).
+        """
+
+        # peek at the first bytes to see what type of file it is
+        magic = self.fo.read(11)
+        if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'):
+            first_token = 'info'
+        elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'):
+            # the glyph header is one byte longer than the 11 bytes read
+            skip = self.fo.read(1)
+            first_token = 'info'
+        else :
+            # other0.dat file
+            first_token = None
+            self.fo.seek(-11,1)
+
+
+        # main loop to read and build the document tree
+        while True:
+
+            if first_token != None :
+                # use "inserted" first token 'info' for page and glyph files
+                tag = self.procToken(first_token)
+                if len(tag) > 0 :
+                    self.doc.append(tag)
+                first_token = None
+
+            v = self.getNext()
+            if (v == None): 
+                break
+
+            if (v == 0x72):
+                self.doLoop72('number')
+            elif (v > 0) and (v < self.dict.getSize()) :
+                tag = self.procToken(self.dict.lookup(v))
+                if len(tag) > 0 :
+                    self.doc.append(tag)
+            else:
+                if self.debug:
+                    print "Mina Loop:  Unknown value: %x" % v 
+
+
+        # now do snippet injection
+        if len(self.snippetList) > 0 :
+            if self.debug : print 'Injecting Snippets:'
+            # only snippet 0 is expanded; it pulls in the others it references
+            snippet = self.injectSnippets(self.snippetList[0])
+            snipno = snippet[0]
+            tag_add = snippet[1]
+            if self.debug : print self.formatTag(tag_add)
+            if len(tag_add) > 0:
+                self.doc.append(tag_add)
+
+        # handle generation of xml output
+        xmlpage = self.formatDoc(self.flat_xml)
+
+        return xmlpage
+
+
+    
+def usage():
+    """Print the command line help for convert2xml.py."""
+    help_lines = (
+        'Usage: ',
+        '    convert2xml.py dict0000.dat infile.dat ',
+        ' ',
+        ' Options:',
+        '   -h            print this usage help message ',
+        '   -d            turn on debug output to check for potential errors ',
+        '   --flat-xml    output the flattened xml page description only ',
+        ' ',
+        '     This program will attempt to convert a page*.dat file or ',
+        ' glyphs*.dat file, using the dict0000.dat file, to its xml description. ',
+        ' ',
+        ' Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump ',
+        ' the *.dat files from a Topaz format e-book.',
+    )
+    for line in help_lines:
+        print line
+
+#
+# Main
+#   
+
+def main(argv):
+    """Convert one page/glyph data file to its xml description.
+
+    argv -- a command line as a single string ('prog [options] dictfile
+            datafile'), or an empty value to use sys.argv instead.
+
+    When sys.argv is used the xml is printed and 0 is returned;
+    otherwise the xml text is returned to the caller.  Bad options or
+    missing file names print the usage text and exit with status 2;
+    -h prints the usage text and exits with status 0.
+    """
+    debug = False
+    flat_xml = False
+    printOutput = False
+    if len(argv) == 0:
+        printOutput = True
+        argv = sys.argv
+    else :
+        argv = argv.split()
+
+    try:
+        opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"])
+
+    except getopt.GetoptError, err:
+
+        # print help information and exit:
+        print str(err) # will print something like "option -a not recognized"
+        usage()
+        sys.exit(2)
+
+    for o, a in opts:
+        if o =="-d":
+            debug=True
+        if o =="-h":
+            usage()
+            sys.exit(0)
+        if o =="--flat-xml":
+            flat_xml = True
+
+    # both file names are required; previously a missing one raised IndexError
+    if len(args) < 2 :
+        usage()
+        sys.exit(2)
+
+    dictFile, pageFile = args[0], args[1]
+
+    # read in the string table dictionary
+    string_table = Dictionary(dictFile)
+
+    # create a page parser
+    pp = PageParser(pageFile, string_table, debug, flat_xml)
+
+    xmlpage = pp.process()
+
+    if printOutput:
+        print xmlpage
+        return 0
+
+    return xmlpage
+
+# run as a script: the empty argument makes main() read sys.argv and print the result
+if __name__ == '__main__':
+    sys.exit(main(''))
diff --git a/Topaz_Tools/lib/decode_meta.py b/Topaz_Tools/lib/decode_meta.py

new file mode 100644 (file)

index 0000000..f038310
--- /dev/null
+++ b/Topaz_Tools/lib/decode_meta.py
@@ -0,0 +1,109 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import with_statement
+import csv
+import sys
+import os
+import getopt
+from struct import pack
+from struct import unpack
+
+#
+# Get a 7 bit encoded number from string
+#
+
+def readEncodedNumber(file):
+    """Read one 7 bit encoded number from an open binary file.
+
+    A leading 0xFF byte marks a negative number.  The value follows,
+    most significant group first, 7 bits per byte; every byte except
+    the last has its high bit set.  Returns the number, or None when
+    the file ends before the number is complete.
+    """
+    c = file.read(1)
+    if not c:
+        return None
+    data = ord(c)
+
+    negative = (data == 0xFF)
+    if negative:
+        c = file.read(1)
+        if not c:
+            return None
+        data = ord(c)
+
+    value = data & 0x7F
+    # a set high bit means at least one more 7 bit group follows
+    while data >= 0x80 :
+        c = file.read(1)
+        if not c:
+            return None
+        data = ord(c)
+        value = (value << 7) + (data & 0x7F)
+
+    if negative:
+        return -value
+    return value
+    
+#
+# Encode a number in 7 bit format
+#
+
+def encodeNumber(number):
+    """Return number as a 7 bit encoded string.
+
+    The value is written most significant group first, 7 bits per
+    character, with the high bit set on every character except the
+    last.  A negative number is written as a 0xFF marker followed by
+    the encoding of (-number + 1).
+
+    NOTE(review): readEncodedNumber in this file negates the value
+    after 0xFF without subtracting 1, so negative numbers do not appear
+    to round-trip - confirm which side matches the file format.
+    """
+    negative = number < 0
+    if negative:
+        number = -number + 1
+
+    # groups are collected least significant first and reversed at the end
+    chunks = []
+    continuation = 0
+    while True:
+        chunks.append(chr((number & 0x7F) + continuation))
+        number = number >> 7
+        continuation = 0x80
+        if number == 0:
+            break
+
+    if negative:
+        chunks.append(chr(0xFF))
+
+    chunks.reverse()
+    return "".join(chunks)
+  
+#
+# Get a length prefixed string from the file 
+#
+def lengthPrefixString(data):
+    """Return data preceded by its length as a 7 bit encoded number."""
+    prefix = encodeNumber(len(data))
+    return prefix + data
+
+def readString(file):
+    """Read a length prefixed string from an open binary file.
+
+    Returns None when no length can be read, an empty string when the
+    file holds fewer bytes than the length announces, and the string
+    otherwise.
+    """
+    stringLength = readEncodedNumber(file)
+    if stringLength is None:
+        return None
+    sv = file.read(stringLength)
+    if len(sv) == stringLength:
+        return unpack(str(stringLength)+"s",sv)[0]
+    return ""
+
+
+
+def getMetaArray(metaFile):
+    """Parse the meta file into a Python dictionary (associative array).
+
+    metaFile -- path of the metadata file: an encoded entry count
+                followed by that many key/value string pairs.
+
+    Returns a dict mapping each key string to its value string.  An
+    empty file gives an empty dict.  The file is closed even when
+    parsing fails.
+    """
+    result = {}
+    with open(metaFile,'rb') as fo:
+        # readEncodedNumber gives None for an empty file; treat it as no entries
+        size = readEncodedNumber(fo) or 0
+        for i in xrange(size):
+            key = readString(fo)
+            result[key] = readString(fo)
+    return result
+
+
+
+def getMetaData(metaFile):
+    """Parse the meta file into text with one 'key|value' line per entry.
+
+    metaFile -- path of the metadata file: an encoded entry count
+                followed by that many key/value string pairs.
+
+    Returns the concatenated lines, each ending in a newline.  An empty
+    file gives an empty string.  The file is closed even when parsing
+    fails.
+    """
+    pieces = []
+    with open(metaFile,'rb') as fo:
+        # readEncodedNumber gives None for an empty file; treat it as no entries
+        size = readEncodedNumber(fo) or 0
+        for i in xrange(size):
+            pieces.append(readString(fo) + '|')
+            pieces.append(readString(fo) + '\n')
+    return ''.join(pieces)
diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py

new file mode 100644 (file)

index 0000000..1a800e8
--- /dev/null
+++ b/Topaz_Tools/lib/flatxml2html.py
@@ -0,0 +1,299 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import with_statement
+import csv
+import sys
+import os
+import getopt
+from struct import pack
+from struct import unpack
+
+
+class DocParser(object):
+    """Build the html for one page from its flattened xml description.
+
+    The flattened xml is a list of 'tag.path=arg|arg|...' lines.  The
+    parser collects the word list, link tables and de-hyphenation data
+    from it and then converts every region of the page to html.
+    """
+    def __init__(self, flatxml, fileid):
+        # the page id is the data file's base name without '.dat'
+        self.id = os.path.basename(fileid).replace('.dat','')
+        self.flatdoc = flatxml.split('\n')
+        self.ocrtext = []
+        self.link_id = []
+        self.link_title = []
+        self.link_page = []
+        self.dehyphen_rootid = []
+        self.paracont_stemid = []
+        self.parastems_stemid = []
+
+
+        
+    # find tag if within pos to end inclusive
+    def findinDoc(self, tagpath, pos, end) :
+        """Find the first line from pos up to end whose name ends with tagpath.
+
+        end == -1 means the end of the document.  Returns (line index,
+        argument text) for the first match, or (-1, None) when there is
+        none.  A line without '=' has an empty argument text.
+        """
+        result = None
+        docList = self.flatdoc
+        cnt = len(docList)
+        if end == -1 :
+            end = cnt
+        else:
+            end = min(cnt,end)
+        foundat = -1
+        for j in xrange(pos, end):
+            item = docList[j]
+            if item.find('=') >= 0:
+                # split on the first '=' only: the argument text itself
+                # (ocr text, link titles) may contain '=' characters
+                (name, argres) = item.split('=', 1)
+            else : 
+                name = item
+                argres = ''
+            if name.endswith(tagpath) : 
+                result = argres
+                foundat = j
+                break
+        return foundat, result
+
+
+    # return list of start positions for the tagpath
+    def posinDoc(self, tagpath):
+        """Return the line indexes of all lines whose name ends with tagpath."""
+        startpos = []
+        pos = 0
+        res = ""
+        while res != None :
+            (foundpos, res) = self.findinDoc(tagpath, pos, -1)
+            if res != None :
+                startpos.append(foundpos)
+            pos = foundpos + 1
+        return startpos
+
+
+    # get a description of the paragraph
+    def getParaDescription(self, start, end):
+        """Return (css class, first word index, last word index) for a region.
+
+        start, end -- line range of the region in the flat document;
+                      end == -1 means the end of the document.
+        """
+        # the explicit loop below needs a real upper bound; with a raw -1
+        # the last region of a page never found its lastWord
+        if end == -1 :
+            end = len(self.flatdoc)
+
+        # normal paragraph
+        (pos, pclass) = self.findinDoc('paragraph.class',start,end) 
+
+        # class names are an issue given topaz starts them with numerals (not allowed)
+        # use a mix of cases, (which cause some browsers problems), and actually
+        # attach numbers after "reclustered*" to the end to deal with reflow issues
+        # so we clean this up by lowercasing, prepend 'cl_', and remove all end pieces after reclustered
+        pclass = pclass.lower()
+        pclass = 'cl_' + pclass
+        p = pclass.find('reclustered')
+        if p > 0 : pclass = pclass[0:p+11]
+
+        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
+        (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
+        if (sfirst != None) and (slast != None) :
+            return pclass, int(sfirst), int(slast)
+
+        # some paragraphs are instead split into multiple spans and some even have word_semantic tags as well
+        # so walk through this region keeping track of the first firstword, and the last lastWord
+        # on any items that have it
+        (pos, sfirst) = self.findinDoc('firstWord',start, end)
+        first = int(sfirst)
+        last = -1
+        for i in xrange(pos+1,end):
+            (pos, slast) = self.findinDoc('lastWord',i,i+1)
+            if slast != None:
+                last = int(slast)
+        return pclass, first, last
+
+
+    def buildParagraph(self, cname, first, last, type, regtype) :
+        """Build the html text for the words first..last-1 of the page.
+
+        cname   -- css class used for the opening <p> tag.
+        type    -- 'full' (open and close <p>), 'begin' (open only),
+                   'end' (close only, continuing a paragraph from the
+                   previous page) or anything else for bare text.
+        regtype -- region type; 'fixed' and 'chapterheading' regions
+                   turn line breaks into <br />.
+        """
+        parares = ''
+        sep =''
+        br_lb = False
+        if (regtype == 'fixed') or (regtype == 'chapterheading') :
+            br_lb = True
+        handle_links = False
+        if len(self.link_id) > 0:
+            handle_links = True
+        if (type == 'full') or (type == 'begin') :
+            parares += '<p class="' + cname + '">'
+        if (type == 'end'):
+            parares += ' '
+        for j in xrange(first, last) :
+            word = self.ocrtext[j]
+            sep = ' '
+
+            if handle_links:
+                link = self.link_id[j]
+                if (link > 0): 
+                    # link numbers are 1-based indexes into the link tables
+                    title = self.link_title[link-1]
+                    if title == "": title='_link_'
+                    ptarget = self.link_page[link-1] - 1
+                    linkhtml = '<a href="#page%04d">' % ptarget
+                    linkhtml += title + '</a>'
+                    # wrap the title if it is already in the text, else append the link
+                    pos = parares.rfind(title)
+                    if pos >= 0:
+                        parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
+                    else :
+                        parares += linkhtml
+                    if word == '_link_' : word = ''
+                elif (link < 0) :
+                    if word == '_link_' : word = ''
+
+            if word == '_lb_':
+                if (j-1) in self.dehyphen_rootid :
+                    word = ''
+                    sep = ''
+                elif handle_links :
+                    word = ''
+                    sep = ''
+                elif br_lb :
+                    word = '<br />\n'
+                    sep = ''
+                else :
+                    word = '\n'
+                    sep = ''
+
+            if j in self.dehyphen_rootid :
+                # drop the trailing hyphen and join directly with the next word
+                word = word[0:-1]
+                sep = ''
+
+            parares += word + sep
+
+        if len(sep) > 0 : parares = parares[0:-1]
+        if (type == 'full') or (type == 'end') :
+            parares += '</p>'
+        return parares
+
+
+    # build the invisible div that anchors the start of this page
+    def _pageAnchor(self, pagetype):
+        return '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp;</div>\n'
+
+
+    
+    # walk the document tree collecting the information needed
+    # to build an html page using the ocrText
+
+    def process(self):
+        """Convert the page to html and return it as a string."""
+
+        htmlpage = ''
+
+        # first collect information from the xml doc that describes this page
+        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
+        if argres :  self.ocrtext = argres.split('|')
+
+        (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
+        if argres: 
+            argList = argres.split('|')
+            self.dehyphen_rootid = [ int(strval) for strval in argList]
+
+        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
+        if self.parastems_stemid == None : self.parastems_stemid = []
+ 
+        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
+        if self.paracont_stemid == None : self.paracont_stemid = []
+
+
+        (pos, argres) = self.findinDoc('info.word.link_id',0,-1)
+        if argres:
+            argList = argres.split('|')
+            self.link_id = [ int(strval) for strval in argList]
+
+        (pos, argres) = self.findinDoc('info.links.page',0,-1)
+        if argres :
+            argList = argres.split('|')
+            self.link_page = [ int(strval) for strval in argList]
+
+        (pos, argres) = self.findinDoc('info.links.title',0,-1)
+        if argres :
+            self.link_title = argres.split('|')
+        else:
+            self.link_title.append('')
+
+        (pos, pagetype) = self.findinDoc('page.type',0,-1)
+
+
+        # generate a list of each region starting point
+        # each region has one paragraph,, or one image, or one chapterheading
+        regionList= self.posinDoc('region')
+        regcnt = len(regionList)
+        regionList.append(-1)
+
+        anchorSet = False
+        breakSet = False
+
+        # process each region tag and convert what you can to html
+
+        for j in xrange(regcnt):
+            start = regionList[j]
+            end = regionList[j+1]
+
+            (pos, regtype) = self.findinDoc('region.type',start,end)
+
+            if regtype == 'graphic' :
+                if not anchorSet:
+                    htmlpage += self._pageAnchor(pagetype)
+                    anchorSet = True
+                (pos, simgsrc) = self.findinDoc('img.src',start,end)
+                if simgsrc:
+                    htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
+            
+            elif regtype == 'chapterheading' :
+                (pclass, first, last) = self.getParaDescription(start,end)
+                if not breakSet:
+                    htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n'
+                    breakSet = True
+                if not anchorSet:
+                    htmlpage += self._pageAnchor(pagetype)
+                    anchorSet = True
+                tag = 'h1'
+                if pclass[3:7] == 'ch1-' : tag = 'h1'
+                if pclass[3:7] == 'ch2-' : tag = 'h2'
+                if pclass[3:7] == 'ch3-' : tag = 'h3'
+                htmlpage += '<' + tag + ' class="' + pclass + '">'
+                htmlpage += self.buildParagraph(pclass,first,last,'middle', regtype)
+                htmlpage += '</' + tag + '>'
+
+            elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') :
+                ptype = 'full'
+                # check to see if this is a continution from the previous page
+                if (len(self.parastems_stemid) > 0):
+                    ptype = 'end'
+                    self.parastems_stemid=[]
+                else:
+                    if not anchorSet:
+                        htmlpage += self._pageAnchor(pagetype)
+                        anchorSet = True
+                (pclass, first, last) = self.getParaDescription(start,end)
+                if ptype == 'full' :
+                    tag = 'p'
+                    if pclass[3:6] == 'h1-' : tag = 'h4'
+                    if pclass[3:6] == 'h2-' : tag = 'h5'
+                    if pclass[3:6] == 'h3-' : tag = 'h6'
+                    htmlpage += '<' + tag + ' class="' + pclass + '">'
+                    htmlpage += self.buildParagraph(pclass, first, last, 'middle', regtype)
+                    htmlpage += '</' + tag + '>'
+                else :
+                    htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype)
+
+
+            elif (regtype == 'tocentry') :
+                ptype = 'full'
+                # check to see if this is a continution from the previous page
+                if (len(self.parastems_stemid) > 0) and (j == 0):
+                    # process the first paragraph as a continuation from the last page
+                    ptype = 'end'
+                    self.parastems_stemid = []
+                else:
+                    if not anchorSet:
+                        htmlpage += self._pageAnchor(pagetype)
+                        anchorSet = True
+                (pclass, first, last) = self.getParaDescription(start,end)
+                htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype)
+
+            else :
+                print 'Unknown region type', regtype
+                print 'Warning: skipping this region'
+
+        # the last paragraph continues on the next page: leave it open
+        if len(self.paracont_stemid) > 0 :
+            if htmlpage[-4:] == '</p>':
+                htmlpage = htmlpage[0:-4]    
+
+        return htmlpage
+
+
+
+def convert2HTML(flatxml, fileid):
+    """Return the html for one page given its flattened xml and its data file name."""
+    # a document parser is created per page and used once
+    return DocParser(flatxml, fileid).process()
diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py

new file mode 100644 (file)

index 0000000..be50aae
--- /dev/null
+++ b/Topaz_Tools/lib/genhtml.py
@@ -0,0 +1,125 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+import os, sys, getopt
+
+# local routines
+import convert2xml
+import flatxml2html
+import decode_meta
+import stylexml2css
+
+
+def usage():
+    print 'Usage: '
+    print ' '
+    print '   genhtml.py unencryptedBookDir'
+    print '  '
+
+
+
+def main(argv):
+    bookDir = ''
+
+    if len(argv) == 0:
+        argv = sys.argv
+    else :
+        argv = argv.split()
+
+    try:
+        opts, args = getopt.getopt(argv[1:], "h:")
+
+    except getopt.GetoptError, err:
+        print str(err)
+        usage()
+        sys.exit(2)
+    
+    if len(opts) == 0 and len(args) == 0 :
+        usage()
+        sys.exit(2) 
+       
+    for o, a in opts:
+        if o =="-h":
+            usage()
+            sys.exit(0)
+
+    bookDir = args[0]
+
+    if not os.path.exists(bookDir) :
+        print "Can not find directory with unencrypted book"
+        sys.exit(-1)
+
+    dictFile = os.path.join(bookDir,'dict0000.dat')
+
+    if not os.path.exists(dictFile) :
+        print "Can not find dict0000.dat file"
+        sys.exit(-1)
+
+    pageDir = os.path.join(bookDir,'page')
+    if not os.path.exists(pageDir) :
+        print "Can not find page directory in unencrypted book"
+        sys.exit(-1)
+
+    imgDir = os.path.join(bookDir,'img')
+    if not os.path.exists(imgDir) :
+        print "Can not find image directory in unencrypted book"
+        sys.exit(-1)
+
+    otherFile = os.path.join(bookDir,'other0000.dat')
+    if not os.path.exists(otherFile) :
+        print "Can not find other0000.dat in unencrypted book"
+        sys.exit(-1)
+
+    metaFile = os.path.join(bookDir,'metadata0000.dat')
+    if not os.path.exists(metaFile) :
+        print "Can not find metadata0000.dat in unencrypted book"
+        sys.exit(-1)
+
+
+    htmlFileName = "book.html"
+    htmlstr = '<html>\n'
+
+    filenames = os.listdir(pageDir)
+    filenames = sorted(filenames)
+
+    print 'Processing ... '
+
+    htmlstr += '<head>\n'
+
+    print '     ', 'metadata0000.dat'
+    fname = os.path.join(bookDir,'metadata0000.dat')
+    xname = os.path.join(bookDir, 'metadata.txt')
+    metastr = decode_meta.getMetaData(fname)
+    file(xname, 'wb').write(metastr)
+    meta_array = decode_meta.getMetaArray(fname)
+    htmlstr += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
+    htmlstr += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
+
+    print '     ', 'other0000.dat'
+    fname = os.path.join(bookDir,'other0000.dat')
+    xname = os.path.join(bookDir, 'style.css')
+    xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
+    cssstr = '<style>\n'
+    cssstr += stylexml2css.convert2CSS(xmlstr)
+    cssstr += '</style>\n'
+    file(xname, 'wb').write(cssstr)
+    htmlstr += cssstr
+    htmlstr += '</head>\n<body>\n'
+
+    for filename in filenames:
+        print '     ', filename
+        fname = os.path.join(pageDir,filename)
+        flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) 
+        htmlstr += flatxml2html.convert2HTML(flat_xml, fname)
+
+    htmlstr += '</body>\n</html>\n'
+
+    file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
+    print 'Processing Complete'
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main(''))
+
+
diff --git a/Topaz_Tools/lib/gensvg.py b/Topaz_Tools/lib/gensvg.py

new file mode 100644 (file)

index 0000000..7df8043
--- /dev/null
+++ b/Topaz_Tools/lib/gensvg.py
@@ -0,0 +1,295 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+import os, sys, getopt
+
+# local routines
+import convert2xml
+import flatxml2html
+import decode_meta
+
+
+class GParser(object):
+   def __init__(self, flatxml):
+       self.flatdoc = flatxml.split('\n')
+       self.dpi = 1440
+       self.gh = self.getData('info.glyph.h')
+       self.gw = self.getData('info.glyph.w')
+       self.guse = self.getData('info.glyph.use')
+       self.count = len(self.guse)
+       self.gvtx = self.getData('info.glyph.vtx')
+       self.glen = self.getData('info.glyph.len')
+       self.gdpi = self.getData('info.glyph.dpi')
+       self.vx = self.getData('info.vtx.x')
+       self.vy = self.getData('info.vtx.y')
+       self.vlen = self.getData('info.len.n')
+       self.glen.append(len(self.vlen))
+       self.gvtx.append(len(self.vx))
+
+   def getData(self, path):
+       result = None
+       cnt = len(self.flatdoc)
+       for j in xrange(cnt):
+           item = self.flatdoc[j]
+           if item.find('=') >= 0:
+               (name, argt) = item.split('=')
+               argres = argt.split('|')
+           else:
+               name = item
+               argres = []
+           if (name == path):
+               result = argres
+               break
+       if (len(argres) > 0) :
+           for j in xrange(0,len(argres)):
+               argres[j] = int(argres[j])
+       return result
+
+   def getPath(self, gly):
+       path = ''
+       if (gly < 0) or (gly >= self.count):
+           return path
+       tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]-1]
+       ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]-1]
+       p = 0
+       for k in xrange(self.glen[gly], self.glen[gly+1]):
+           if (p == 0):
+               zx = tx[0:self.vlen[k]+1]
+               zy = ty[0:self.vlen[k]+1]
+           else:
+               zx = tx[self.vlen[k-1]+1:self.vlen[k]+1]
+               zy = ty[self.vlen[k-1]+1:self.vlen[k]+1]
+           p += 1
+           for j in xrange(0, len(zx)):
+               if (j == 0):
+                   path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
+               else:
+                   path += 'L %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
+       path += 'z'
+       return path
+
+class PParser(object):
+   def __init__(self, flatxml):
+       self.flatdoc = flatxml.split('\n')
+       self.temp = []
+       self.ph = self.getData('page.h')[0]
+       self.pw = self.getData('page.w')[0]
+       self.gx = self.getData('info.glyph.x')
+       self.gy = self.getData('info.glyph.y')
+       self.gid = self.getData('info.glyph.glyphID')
+
+   def getData(self, path):
+       result = None
+       cnt = len(self.flatdoc)
+       for j in xrange(cnt):
+           item = self.flatdoc[j]
+           if item.find('=') >= 0:
+               (name, argt) = item.split('=')
+               argres = argt.split('|')
+           else:
+               name = item
+               argres = []
+           if (name.endswith(path)):
+               result = argres
+               break
+       if (len(argres) > 0) :
+           for j in xrange(0,len(argres)):
+               argres[j] = int(argres[j])
+       return result
+
+   def getDataTemp(self, path):
+       result = None
+       cnt = len(self.temp)
+       for j in xrange(cnt):
+           item = self.temp[j]
+           if item.find('=') >= 0:
+               (name, argt) = item.split('=')
+               argres = argt.split('|')
+           else:
+               name = item
+               argres = []
+           if (name.endswith(path)):
+               result = argres
+               self.temp.pop(j)
+               break
+       if (len(argres) > 0) :
+           for j in xrange(0,len(argres)):
+               argres[j] = int(argres[j])
+       return result
+
+   def getImages(self):
+       result = []
+       self.temp = self.flatdoc
+       while (self.getDataTemp('region.img') != None):
+           h = self.getDataTemp('region.img.h')[0]
+           w = self.getDataTemp('region.img.w')[0]
+           x = self.getDataTemp('region.img.x')[0]
+           y = self.getDataTemp('region.img.y')[0]
+           src = self.getDataTemp('region.img.src')[0]
+           result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
+       return result
+
+   def getGlyphs(self,glyfname):
+       result = []
+       if (self.gid != None) and (len(self.gid) > 0):
+           glyphs = []
+           for j in set(self.gid):
+               glyphs.append(j)
+           glyphs.sort()
+           gfile = open(glyfname, 'r')
+           j = 0
+           while True :
+               inp = gfile.readline()
+               if (inp == ''):
+                   break
+               id='id="gl%d"' % glyphs[j]
+               if (inp.find(id) > 0):
+                   result.append(inp)
+                   j += 1
+                   if (j == len(glyphs)):
+                       break
+           gfile.close()
+       return result
+
+
+
+
+def usage():
+   print 'Usage: '
+   print ' '
+   print '   gensvg.py unencryptedBookDir'
+   print '  '
+
+
+def main(argv):
+   bookDir = ''
+
+   if len(argv) == 0:
+       argv = sys.argv
+   else :
+       argv = argv.split()
+
+   try:
+       opts, args = getopt.getopt(argv[1:], "h:")
+
+   except getopt.GetoptError, err:
+       print str(err)
+       usage()
+       sys.exit(2)
+
+   if len(opts) == 0 and len(args) == 0 :
+       usage()
+       sys.exit(2) 
+
+   for o, a in opts:
+       if o =="-h":
+           usage()
+           sys.exit(0)
+
+   bookDir = args[0]
+
+   if not os.path.exists(bookDir) :
+       print "Can not find directory with unencrypted book"
+       sys.exit(-1)
+
+   dictFile = os.path.join(bookDir,'dict0000.dat')
+
+   if not os.path.exists(dictFile) :
+       print "Can not find dict0000.dat file"
+       sys.exit(-1)
+
+   pageDir = os.path.join(bookDir,'page')
+   if not os.path.exists(pageDir) :
+       print "Can not find page directory in unencrypted book"
+       sys.exit(-1)
+
+   imgDir = os.path.join(bookDir,'img')
+   if not os.path.exists(imgDir) :
+       print "Can not find image directory in unencrypted book"
+       sys.exit(-1)
+
+   glyphsDir = os.path.join(bookDir,'glyphs')
+   if not os.path.exists(glyphsDir) :
+       print "Can not find glyphs directory in unencrypted book"
+       sys.exit(-1)
+
+   metaFile = os.path.join(bookDir,'metadata0000.dat')
+   if not os.path.exists(metaFile) :
+       print "Can not find metadata0000.dat in unencrypted book"
+       sys.exit(-1)
+
+   svgDir = os.path.join(bookDir,'svg')
+   if not os.path.exists(svgDir) :
+       os.makedirs(svgDir)
+
+
+   print 'Processing Meta Data ... '
+
+   print '     ', 'metadata0000.dat'
+   fname = os.path.join(bookDir,'metadata0000.dat')
+   metadata = decode_meta.getMetaArray(fname)
+
+   print 'Processing Glyphs ... '
+
+   filenames = os.listdir(glyphsDir)
+   filenames = sorted(filenames)
+
+   glyfname = os.path.join(svgDir,'glyphs.svg')
+   glyfile = open(glyfname, 'w')
+   glyfile.write('<?xml version="1.0" standalone="no"?>\n')
+   glyfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
+   glyfile.write('<svg width="512" height="512" viewBox="0 0 511 511" xmlns="http://www.w3.org/2000/svg" version="1.1">\n')
+   glyfile.write('<title>Glyphs for %s</title>\n' % metadata['Title'])
+   glyfile.write('<defs>\n')
+   counter = 0
+   for filename in filenames:
+       print '     ', filename
+       fname = os.path.join(glyphsDir,filename)
+       flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) 
+       gp = GParser(flat_xml)
+       for i in xrange(0, gp.count):
+           path = gp.getPath(i)
+           glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path))
+       counter += 1
+   glyfile.write('</defs>\n')
+   glyfile.write('</svg>\n')
+   glyfile.close()
+
+   print 'Processing Pages ... '
+
+   scaledpi = 720
+   filenames = os.listdir(pageDir)
+   filenames = sorted(filenames)
+   counter = 0
+   for filename in filenames:
+       print '     ', filename
+       fname = os.path.join(pageDir,filename)
+       flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) 
+       pp = PParser(flat_xml)
+       pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
+       pfile.write('<?xml version="1.0" standalone="no"?>\n')
+       pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
+       pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1))
+       pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
+       if (pp.gid != None): 
+           pfile.write('<defs>\n')
+           gdefs = pp.getGlyphs(glyfname)
+           for j in xrange(0,len(gdefs)):
+               pfile.write(gdefs[j])
+           pfile.write('</defs>\n')
+           for j in xrange(0,len(pp.gid)):
+               pfile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j]))
+       img = pp.getImages()
+       if (img != None):
+           for j in xrange(0,len(img)):
+               pfile.write(img[j])
+       pfile.write('</svg>')
+       pfile.close()
+       counter += 1
+
+   print 'Processing Complete'
+
+   return 0
+
+if __name__ == '__main__':
+   sys.exit(main(''))
diff --git a/Topaz_Tools/lib/genxml.py b/Topaz_Tools/lib/genxml.py

new file mode 100644 (file)

index 0000000..c335e88
--- /dev/null
+++ b/Topaz_Tools/lib/genxml.py
@@ -0,0 +1,121 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+import os, sys, getopt
+
+# local routines
+import convert2xml
+import flatxml2html
+import decode_meta
+
+
+def usage():
+    print 'Usage: '
+    print ' '
+    print '   genxml.py dict0000.dat unencryptedBookDir'
+    print '  '
+
+
+
+def main(argv):
+    bookDir = ''
+
+    if len(argv) == 0:
+        argv = sys.argv
+    else :
+        argv = argv.split()
+
+    try:
+        opts, args = getopt.getopt(argv[1:], "h:")
+
+    except getopt.GetoptError, err:
+        print str(err)
+        usage()
+        sys.exit(2)
+    
+    if len(opts) == 0 and len(args) == 0 :
+        usage()
+        sys.exit(2) 
+       
+    for o, a in opts:
+        if o =="-h":
+            usage()
+            sys.exit(0)
+
+    bookDir = args[0]
+
+    if not os.path.exists(bookDir) :
+        print "Can not find directory with unencrypted book"
+        sys.exit(-1)
+
+    dictFile = os.path.join(bookDir,'dict0000.dat')
+    if not os.path.exists(dictFile) :
+        print "Can not find dict0000.dat file"
+        sys.exit(-1)
+
+    pageDir = os.path.join(bookDir,'page')
+    if not os.path.exists(pageDir) :
+        print "Can not find page directory in unencrypted book"
+        sys.exit(-1)
+
+    glyphsDir = os.path.join(bookDir,'glyphs')
+    if not os.path.exists(glyphsDir) :
+        print "Can not find glyphs directory in unencrypted book"
+        sys.exit(-1)
+
+    otherFile = os.path.join(bookDir,'other0000.dat')
+    if not os.path.exists(otherFile) :
+        print "Can not find other0000.dat in unencrypted book"
+        sys.exit(-1)
+
+    metaFile = os.path.join(bookDir,'metadata0000.dat')
+    if not os.path.exists(metaFile) :
+        print "Can not find metadata0000.dat in unencrypted book"
+        sys.exit(-1)
+
+    xmlDir = os.path.join(bookDir,'xml')
+    if not os.path.exists(xmlDir):
+        os.makedirs(xmlDir)
+
+
+    print 'Processing ... '
+
+    print '     ', 'metadata0000.dat'
+    fname = os.path.join(bookDir,'metadata0000.dat')
+    xname = os.path.join(xmlDir, 'metadata.txt')
+    metastr = decode_meta.getMetaData(fname)
+    file(xname, 'wb').write(metastr)
+
+    print '     ', 'other0000.dat'
+    fname = os.path.join(bookDir,'other0000.dat')
+    xname = os.path.join(xmlDir, 'stylesheet.xml')
+    xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
+    file(xname, 'wb').write(xmlstr)
+    
+    filenames = os.listdir(pageDir)
+    filenames = sorted(filenames)
+
+    for filename in filenames:
+        print '     ', filename
+        fname = os.path.join(pageDir,filename)
+        xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
+        xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
+        file(xname, 'wb').write(xmlstr)
+
+    filenames = os.listdir(glyphsDir)
+    filenames = sorted(filenames)
+
+    for filename in filenames:
+        print '     ', filename
+        fname = os.path.join(glyphsDir,filename)
+        xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
+        xmlstr = convert2xml.main('convert2xml.py ' + dictFile + ' ' + fname)
+        file(xname, 'wb').write(xmlstr)
+ 
+
+    print 'Processing Complete'
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main(''))
diff --git a/Topaz_Tools/lib/readme.txt b/Topaz_Tools/lib/readme.txt

new file mode 100644 (file)

index 0000000..4a79d20
--- /dev/null
+++ b/Topaz_Tools/lib/readme.txt
@@ -0,0 +1,75 @@
+This is experimental and it will probably not work for you but...
+
+ALSO:  Please do not use any of this to steal.  Theft is wrong. 
+       This is meant to allow conversion of Topaz books for other book readers you own
+
+Here are the steps:
+
+1. Unzip the topazscripts.zip file to get the full set of python scripts.
+The files you should have after unzipping are:
+
+cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files
+decode_meta.py - converts metadata0000.dat to human readable text (for the most part)
+convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions
+flatxml2html.py - converts a "flattened" xml description to html using the ocrtext
+stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can)
+genxml.py - main program to convert everything to xml
+genhtml.py - main program to generate "book.html"
+gensvg.py - (author: clarknova) main program to create an svg graphic of each page
+
+Please note, gensvg.py, genhtml.py, and genxml.py import and use
+decode_meta.py, convert2xml.py, flatxml2html.py, and stylexml2css.py 
+so please keep all of these python scripts together in the same place.
+
+
+
+2. Remove the DRM from the Topaz book and build a directory 
+of its contents as files
+
+All Thanks go to CMBTC who broke the DRM for Topaz - without it nothing else 
+would be possible
+
+   cmbtc_dump.py -d -o TARGETDIR [-p pid] YOURTOPAZBOOKNAMEHERE
+
+This should create a directory called "TARGETDIR" in your current directory.  
+It should have the following files in it:
+
+metadata0000.dat - metadata info
+other0000.dat - information used to create a style sheet
+dict0000.dat - dictionary of words used to build page descriptions
+page - directory filled with page*.dat files
+glyphs - directory filled with glyphs*.dat files
+img - directory filled with img*.jpg files
+
+
+
+3. Convert the files in "TARGETDIR" to their xml descriptions
+which can be found in TARGETDIR/xml/ upon completion.
+
+   genxml.py TARGETDIR
+
+
+
+4. Create book.html which can be found in "TARGETDIR" after 
+completion.  This html conversion can not fully capture 
+all of the layouts actually used in the book and needs to 
+be edited to include special font handling such as bold 
+or italics that can not be determined from the ocrText
+information or the style information.  If you want to 
+see things exactly as they were, see step 5 below.
+
+   genhtml.py TARGETDIR
+
+
+
+5. Create an svg description of each page which can
+be found in TARGETDIR/svg/ upon completion.
+
+All thanks go to CLARKNOVA for this program.  This program is 
+needed to actually see the true image of each page so that hand
+editing of the html created by step 4 can be done.  
+
+Or use the resulting svg files to read each page of the book
+exactly as it has been laid out originally.
+
+   gensvg.py TARGETDIR
+
diff --git a/Topaz_Tools/lib/stylexml2css.py b/Topaz_Tools/lib/stylexml2css.py

new file mode 100644 (file)

index 0000000..cf02984
--- /dev/null
+++ b/Topaz_Tools/lib/stylexml2css.py
@@ -0,0 +1,221 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import with_statement
+import csv
+import sys
+import os
+import getopt
+from struct import pack
+from struct import unpack
+
+
+class DocParser(object):
+    def __init__(self, flatxml):
+        self.flatdoc = flatxml.split('\n')
+
+    stags = {
+        'paragraph' : 'p',
+        'graphic'   : '.graphic'
+    }
+
+    attr_val_map = {
+        'hang'            : ('text-indent: ', 135),
+        'indent'          : ('text-indent: ', 135),
+        'line-space'      : ('line-height: ', 190),
+        'margin-bottom'   : ('margin-bottom: ', 135),
+        'margin-left'     : ('margin-left: ', 135),
+        'margin-right'    : ('margin-right: ', 135),
+        'margin-top'      : ('margin-top: ', 135),
+        'space-after'     : ('padding-bottom: ', 135),
+    }
+
+    attr_str_map = {
+        'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
+        'align-left'   : 'text-align: left;',
+        'align-right'  : 'text-align: right;',
+        'align-justify' : 'text-align: justify;',
+        'display-inline' : 'display: inline;',
+        'pos-left' : 'text-align: left;',
+        'pos-right' : 'text-align: right;',
+        'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
+    }
+    
+    
+    # find tag if within pos to end inclusive
+    def findinDoc(self, tagpath, pos, end) :
+        result = None
+        docList = self.flatdoc
+        cnt = len(docList)
+        if end == -1 :
+            end = cnt
+        else:
+            end = min(cnt,end)
+        foundat = -1
+        for j in xrange(pos, end):
+            item = docList[j]
+            if item.find('=') >= 0:
+                (name, argres) = item.split('=')
+            else : 
+                name = item
+                argres = ''
+            if name.endswith(tagpath) : 
+                result = argres
+                foundat = j
+                break
+        return foundat, result
+
+
+    # return list of start positions for the tagpath
+    def posinDoc(self, tagpath):
+        startpos = []
+        pos = 0
+        res = ""
+        while res != None :
+            (foundpos, res) = self.findinDoc(tagpath, pos, -1)
+            if res != None :
+                startpos.append(foundpos)
+            pos = foundpos + 1
+        return startpos
+
+
+    def process(self):
+
+        csspage = ''
+
+        # generate a list of each <style> starting point in the stylesheet
+        styleList= self.posinDoc('book.stylesheet.style')
+        stylecnt = len(styleList)
+        styleList.append(-1)
+
+        # process each style converting what you can
+
+        for j in xrange(stylecnt):
+            start = styleList[j]
+            end = styleList[j+1]
+
+            (pos, tag) = self.findinDoc('style._tag',start,end)
+            if tag == None :
+                (pos, tag) = self.findinDoc('style.type',start,end)
+                
+            # Is this something we know how to convert to css
+            if tag in self.stags :
+
+                # get the style class
+                (pos, sclass) = self.findinDoc('style.class',start,end)
+                if sclass != None:
+                    sclass = '.cl_' + sclass.lower()
+                else : 
+                    sclass = ''
+
+                # check for any "after class" specifiers
+                (pos, aftclass) = self.findinDoc('style._after_class',start,end)
+                if aftclass != None:
+                    aftclass = '.cl_' + aftclass.lower()
+                else : 
+                    aftclass = ''
+
+                cssargs = {}
+
+                while True :
+
+                    (pos, attr) = self.findinDoc('style.rule.attr', start, end)
+                    (pos, val) = self.findinDoc('style.rule.value', start, end)
+
+                    if attr == None : break
+                    
+                    if (attr == 'display') or (attr == 'pos') or (attr == 'align'):
+                        # handle text based attributess
+                        attr = attr + '-' + val
+                        if attr in self.attr_str_map :
+                            cssargs[attr] = (self.attr_str_map[attr], '')
+                    else :
+                        # handle value based attributes
+                        if attr in self.attr_val_map :
+                            (name, scale) = self.attr_val_map[attr]
+                            if not ((attr == 'hang') and (int(val) == 0)) :
+                                ems = int(val)/scale
+                                cssargs[attr] = (self.attr_val_map[attr][0], ems)
+                                keep = True
+
+                    start = pos + 1
+
+                # disable all of the after class tags until I figure out how to handle them
+                # remove all numerals after the "reclustered" 
+
+                if aftclass != "" : keep = False
+
+                p = sclass.find('reclustered') 
+                if p >= 0:
+                    sclass = sclass[0:p+11]
+
+                if keep :
+                    # make sure line-space does not go below 1em
+                    if 'line-space' in cssargs:
+                        seg = cssargs['line-space'][0]
+                        val = cssargs['line-space'][1]
+                        if val < 1.0: val = 1.0
+                        del cssargs['line-space']
+                        cssargs['line-space'] = (self.attr_val_map['line-space'][0], val)
+
+
+                    
+                    # handle modifications for css style hanging indents
+                    if 'hang' in cssargs:
+                        hseg = cssargs['hang'][0]
+                        hval = cssargs['hang'][1]
+                        del cssargs['hang']
+                        cssargs['hang'] = (self.attr_val_map['hang'][0], -hval)
+                        mval = 0
+                        mseg = 'margin-left: '
+                        if 'margin-left' in cssargs:
+                            mseg = cssargs['margin-left'][0]
+                            mval = cssargs['margin-left'][1]
+                            mval = hval + mval
+                            cssargs['margin-left'] = (mseg, mval)
+                        if 'indent' in cssargs:
+                            del cssargs['indent']
+
+                    cssline = sclass + ' { '
+                    for key in iter(cssargs):
+                        mseg = cssargs[key][0]
+                        mval = cssargs[key][1]
+                        if mval == '':
+                            cssline += mseg + ' '
+                        else :
+                            aseg = mseg + '%.1fem;' % mval
+                            cssline += aseg + ' '
+
+                    cssline += '}'
+
+                    # handle special case of paragraph class used inside chapter heading
+                    # and non-chapter headings
+                    if sclass != '' :
+                        ctype = sclass[4:7]
+                        if ctype == 'ch1' :
+                            csspage += 'h1' + cssline + '\n'
+                        if ctype == 'ch2' :
+                            csspage += 'h2' + cssline + '\n'
+                        if ctype == 'ch3' :
+                            csspage += 'h3' + cssline + '\n'
+                        if ctype == 'h1-' :
+                            csspage += 'h4' + cssline + '\n'
+                        if ctype == 'h2-' :
+                            csspage += 'h5' + cssline + '\n'
+                        if ctype == 'h3_' :
+                            csspage += 'h6' + cssline + '\n'
+
+                    csspage += self.stags[tag] + cssline + '\n'
+        
+        return csspage
+
+
+
+def convert2CSS(flatxml):
+
+    # create a document parser
+    dp = DocParser(flatxml)
+
+    csspage = dp.process()
+
+    return csspage
author	some_updates <some_updates@gmail.com>
	Sun, 17 Jan 2010 12:10:35 +0000 (12:10 +0000)
committer	Apprentice Alf <apprenticealf@gmail.com>
	Sat, 28 Feb 2015 12:11:14 +0000 (12:11 +0000)
Topaz_Tools/lib/cmbtc_dump.py	[new file with mode: 0644]	patch \| blob
Topaz_Tools/lib/convert2xml.py	[new file with mode: 0644]	patch \| blob
Topaz_Tools/lib/decode_meta.py	[new file with mode: 0644]	patch \| blob
Topaz_Tools/lib/flatxml2html.py	[new file with mode: 0644]	patch \| blob
Topaz_Tools/lib/genhtml.py	[new file with mode: 0644]	patch \| blob
Topaz_Tools/lib/gensvg.py	[new file with mode: 0644]	patch \| blob
Topaz_Tools/lib/genxml.py	[new file with mode: 0644]	patch \| blob
Topaz_Tools/lib/readme.txt	[new file with mode: 0644]	patch \| blob
Topaz_Tools/lib/stylexml2css.py	[new file with mode: 0644]	patch \| blob