description = 'Removes DRM from Mobipocket, Kindle/Mobi, Kindle/Topaz and Kindle/Print Replica files. Based on the work of many, including DiapDealer, SomeUpdates, IHeartCabbages, CMBDTC, Skindle, DarkReverser, ApprenticeAlf, etc.'
supported_platforms = ['osx', 'windows', 'linux'] # Platforms this plugin will run on
author = 'DiapDealer, SomeUpdates' # The author of this plugin
- version = (0, 3, 7) # The version number of this plugin
+ version = (0, 3, 8) # The version number of this plugin
file_types = set(['prc','mobi','azw','azw1','azw4','tpz']) # The file types that this plugin will be applied to
on_import = True # Run this plugin during the import
priority = 210 # run this plugin before mobidedrm, k4pcdedrm, k4dedrm
from struct import pack
from struct import unpack
+class TpzDRMError(Exception):
+ pass
# Get a 7-bit encoded number from a string. The most
# significant byte comes first and has the high bit (8th) set
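For reference, a minimal sketch of such a decoder, assuming every byte except the last has its high bit set as a continuation flag (the function name and framing are illustrative, not the tool's own routine):

    def read_seven_bit_number(data, pos=0):
        # accumulate 7 payload bits per byte, most significant byte first;
        # assumption: a set high bit (0x80) means another byte follows
        val = 0
        while True:
            b = ord(data[pos])
            pos += 1
            val = (val << 7) | (b & 0x7F)
            if not (b & 0x80):
                return val, pos

    # read_seven_bit_number('\x81\x00') -> (128, 2)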
return self.stable[self.pos]
else:
print "Error - %d outside of string table limits" % val
- sys.exit(-1)
+ raise TpzDRMError('outside of string table limits')
+ # sys.exit(-1)
def getSize(self):
return self.size
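Swapping sys.exit(-1) for a typed TpzDRMError matters once this code runs inside calibre: a plugin must not kill the host process, and the caller can now catch and report the failure instead. A hypothetical calling pattern this change enables (decode_topaz_stream is an invented name):

    try:
        doc = decode_topaz_stream(rawdata)
    except TpzDRMError, e:
        print "Topaz decoding failed: %s" % str(e)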
'paragraph.class' : (1, 'scalar_text', 0, 0),
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
+
'word_semantic' : (1, 'snippets', 1, 1),
'word_semantic.type' : (1, 'scalar_text', 0, 0),
'_span' : (1, 'snippets', 1, 0),
'_span.firstWord' : (1, 'scalar_number', 0, 0),
- '-span.lastWord' : (1, 'scalar_number', 0, 0),
+ '_span.lastWord' : (1, 'scalar_number', 0, 0),
+ '_span.gridSize' : (1, 'scalar_number', 0, 0),
+ '_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ '_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'span' : (1, 'snippets', 1, 0),
'span.firstWord' : (1, 'scalar_number', 0, 0),
'span.lastWord' : (1, 'scalar_number', 0, 0),
+ 'span.gridSize' : (1, 'scalar_number', 0, 0),
+ 'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ 'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'extratokens' : (1, 'snippets', 1, 0),
'extratokens.type' : (1, 'scalar_text', 0, 0),
pclass = self.getClass(pclass)
+ # if paragraph uses extratokens (extra glyphs) then make it fixed
+ (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
+
# build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
last = int(slast)
makeImage = (regtype == 'vertical') or (regtype == 'table')
+ makeImage = makeImage or (extraglyphs != None)
if self.fixedimage:
makeImage = makeImage or (regtype == 'fixed')
word_class = ''
+ word_semantic_type = ''
+
while (line < end) :
(name, argres) = self.lineinDoc(line)
return parares
+ def buildTOCEntry(self, pdesc) :
+ parares = ''
+ sep =''
+ tocentry = ''
+ handle_links = len(self.link_id) > 0
+
+ lstart = 0
+
+ cnt = len(pdesc)
+ for j in xrange( 0, cnt) :
+
+ (wtype, num) = pdesc[j]
+
+ if wtype == 'ocr' :
+ word = self.ocrtext[num]
+ sep = ' '
+
+ if handle_links:
+ link = self.link_id[num]
+ if (link > 0):
+ linktype = self.link_type[link-1]
+ title = self.link_title[link-1]
+ title = title.rstrip('. ')
+ alt_title = parares[lstart:]
+ alt_title = alt_title.strip()
+ # now strip off the actual printed page number
+ alt_title = alt_title.rstrip('0123456789ivxldIVXLD-.')
+ alt_title = alt_title.rstrip('. ')
+ # skip over any external links - can't have them in a book's toc
+ if linktype == 'external' :
+ title = ''
+ alt_title = ''
+ linkpage = ''
+ else :
+ if len(self.link_page) >= link :
+ ptarget = self.link_page[link-1] - 1
+ linkpage = '%04d' % ptarget
+ else :
+ # just link to the current page
+ linkpage = self.id[4:]
+ if len(alt_title) >= len(title):
+ title = alt_title
+ if title != '' and linkpage != '':
+ tocentry += title + '|' + linkpage + '\n'
+ lstart = len(parares)
+ if word == '_link_' : word = ''
+ elif (link < 0) :
+ if word == '_link_' : word = ''
+
+ if word == '_lb_':
+ word = ''
+ sep = ''
+
+ if num in self.dehyphen_rootid :
+ word = word[0:-1]
+ sep = ''
+
+ parares += word + sep
+
+ else :
+ continue
+
+ return tocentry
+
+
+
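buildTOCEntry emits one pipe-separated 'title|linkpage' record per usable internal link, newline-terminated; the records are split back apart later when toc.xhtml is written. An illustrative round trip (titles and page numbers invented):

    tocentries = 'Chapter One|0012\nChapter Two|0031\n'
    for entry in tocentries.rstrip('\n').split('\n'):
        title, pagenum = entry.split('|')
        # ('Chapter One', '0012'), then ('Chapter Two', '0031')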
# walk the document tree collecting the information needed
# to build an html page using the ocrText
def process(self):
htmlpage = ''
+ tocinfo = ''
# get the ocr text
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
+ tocinfo += self.buildTOCEntry(pdesc)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
-
elif (regtype == 'vertical') or (regtype == 'table') :
ptype = 'full'
if inGroup:
htmlpage = htmlpage[0:-4]
last_para_continued = False
- return htmlpage
-
+ return htmlpage, tocinfo
def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
# create a document parser
dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
- htmlpage = dp.process()
- return htmlpage
+ htmlpage, tocinfo = dp.process()
+ return htmlpage, tocinfo
class PParser(object):
- def __init__(self, gd, flatxml):
+ def __init__(self, gd, flatxml, meta_array):
self.gd = gd
self.flatdoc = flatxml.split('\n')
+ self.docSize = len(self.flatdoc)
self.temp = []
- foo = self.getData('page.h') or self.getData('book.h')
- self.ph = foo[0]
- foo = self.getData('page.w') or self.getData('book.w')
- self.pw = foo[0]
- self.gx = self.getData('info.glyph.x')
- self.gy = self.getData('info.glyph.y')
- self.gid = self.getData('info.glyph.glyphID')
+
+ self.ph = -1
+ self.pw = -1
+ startpos = self.posinDoc('page.h') or self.posinDoc('book.h')
+ for p in startpos:
+ (name, argres) = self.lineinDoc(p)
+ self.ph = max(self.ph, int(argres))
+ startpos = self.posinDoc('page.w') or self.posinDoc('book.w')
+ for p in startpos:
+ (name, argres) = self.lineinDoc(p)
+ self.pw = max(self.pw, int(argres))
+
+ if self.ph <= 0:
+ self.ph = int(meta_array.get('pageHeight', '11000'))
+ if self.pw <= 0:
+ self.pw = int(meta_array.get('pageWidth', '8500'))
+
+ res = []
+ startpos = self.posinDoc('info.glyph.x')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.x', p)
+ res.extend(argres)
+ self.gx = res
+
+ res = []
+ startpos = self.posinDoc('info.glyph.y')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.y', p)
+ res.extend(argres)
+ self.gy = res
+
+ res = []
+ startpos = self.posinDoc('info.glyph.glyphID')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.glyphID', p)
+ res.extend(argres)
+ self.gid = res
+
+
+ # return tag at line pos in document
+ def lineinDoc(self, pos) :
+ if (pos >= 0) and (pos < self.docSize) :
+ item = self.flatdoc[pos]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=',1)
+ else :
+ name = item
+ argres = ''
+ return name, argres
+
+ # find tag in doc if within pos to end inclusive
+ def findinDoc(self, tagpath, pos, end) :
+ result = None
+ if end == -1 :
+ end = self.docSize
+ else:
+ end = min(self.docSize, end)
+ foundat = -1
+ for j in xrange(pos, end):
+ item = self.flatdoc[j]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=',1)
+ else :
+ name = item
+ argres = ''
+ if name.endswith(tagpath) :
+ result = argres
+ foundat = j
+ break
+ return foundat, result
+
+ # return list of start positions for the tagpath
+ def posinDoc(self, tagpath):
+ startpos = []
+ pos = 0
+ res = ""
+ while res != None :
+ (foundpos, res) = self.findinDoc(tagpath, pos, -1)
+ if res != None :
+ startpos.append(foundpos)
+ pos = foundpos + 1
+ return startpos
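All of these helpers scan the same flattened document format: one 'dotted.tag.path=value' line per node, with multi-valued data joined by '|'. A standalone illustration of the findinDoc/posinDoc scan (sample lines invented):

    flatdoc = ['page.h=11000', 'info.glyph.x=10|20|30', 'info.glyph.x=40|50']

    def find(tagpath, pos):
        for j in range(pos, len(flatdoc)):
            name, _, argres = flatdoc[j].partition('=')
            if name.endswith(tagpath):
                return j, argres
        return -1, None

    positions, pos = [], 0
    while True:
        foundpos, res = find('info.glyph.x', pos)
        if res is None:
            break
        positions.append(foundpos)
        pos = foundpos + 1
    # positions == [1, 2]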
+
def getData(self, path):
result = None
cnt = len(self.flatdoc)
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
+
+ def getDataatPos(self, path, pos):
+ result = None
+ item = self.flatdoc[pos]
+ if item.find('=') >= 0:
+ (name, argt) = item.split('=',1)
+ argres = argt.split('|')
+ else:
+ name = item
+ argres = []
+ if (len(argres) > 0) :
+ for j in xrange(0,len(argres)):
+ argres[j] = int(argres[j])
+ if (name.endswith(path)):
+ result = argres
+ return result
+
def getDataTemp(self, path):
result = None
cnt = len(self.temp)
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
+
def getImages(self):
result = []
self.temp = self.flatdoc
src = self.getDataTemp('img.src')[0]
result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
return result
+
def getGlyphs(self):
result = []
if (self.gid != None) and (len(self.gid) > 0):
return result
-def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
+def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi):
ml = ''
- pp = PParser(gdict, flat_xml)
+ pp = PParser(gdict, flat_xml, meta_array)
ml += '<?xml version="1.0" standalone="no"?>\n'
if (raw):
ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
- ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
+ ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
else:
ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
- ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
+ ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
ml += '<script><![CDATA[\n'
ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
ml += 'var dpi=%d;\n' % scaledpi
- if (counter) :
- ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
- if (counter < numfiles-1) :
- ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
+ if (previd) :
+ ml += 'var prevpage="page%04d.xhtml";\n' % (previd)
+ if (nextid) :
+ ml += 'var nextpage="page%04d.xhtml";\n' % (nextid)
ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
ml += '</head>\n'
ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
ml += '<div style="white-space:nowrap;">\n'
- if (counter == 0) :
+ if previd == None:
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
else:
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'
+
ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
if (pp.gid != None):
ml += '<defs>\n'
for j in xrange(0,len(pp.gid)):
ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
- ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n'
+ xpos = "%d" % (pp.pw // 3)
+ ypos = "%d" % (pp.ph // 3)
+ ml += '<text x="' + xpos + '" y="' + ypos + '" font-size="' + meta_array['fontSize'] + '" font-family="Helvetica" stroke="black">This page intentionally left blank.</text>\n'
if (raw) :
ml += '</svg>'
else :
ml += '</svg></a>\n'
- if (counter == numfiles - 1) :
+ if nextid == None:
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
else :
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'
from struct import pack
from struct import unpack
+class TpzDRMError(Exception):
+ pass
# local support routines
if 'calibre' in sys.modules:
return self.stable[self.pos]
else:
print "Error - %d outside of string table limits" % val
- sys.exit(-1)
+ raise TpzDRMError('outside of string table limits')
+ # sys.exit(-1)
def getSize(self):
return self.size
def getPos(self):
(ph, pw) = getPageDim(flat_xml)
if (ph == '-1') or (ph == '0') : ph = '11000'
if (pw == '-1') or (pw == '0') : pw = '8500'
-
- # print ' ', 'other0000.dat'
+ meta_array['pageHeight'] = ph
+ meta_array['pageWidth'] = pw
+ if 'fontSize' not in meta_array.keys():
+ meta_array['fontSize'] = fontsize
+
+ # process other.dat for css info and for map of page files to svg images
+ # this map is needed because some pages actually are made up of multiple
+ # pageXXXX.xml files
xname = os.path.join(bookDir, 'style.css')
flat_xml = convert2xml.fromData(dict, otherFile)
+
+ # extract info.original.pid to get original page information
+ pageIDMap = {}
+ pageidnums = stylexml2css.getpageIDMap(flat_xml)
+ if len(pageidnums) == 0:
+ filenames = os.listdir(pageDir)
+ numfiles = len(filenames)
+ for k in range(numfiles):
+ pageidnums.append(k)
+ # create a map from page ids to list of page file nums to process for that page
+ for i in range(len(pageidnums)):
+ id = pageidnums[i]
+ if id in pageIDMap.keys():
+ pageIDMap[id].append(i)
+ else:
+ pageIDMap[id] = [i]
+
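A worked example of the mapping with invented ids: if pageidnums is [3, 3, 4], page files 0 and 1 share original page id 3, so they will later be concatenated into a single svg page:

    pageidnums = [3, 3, 4]
    # after the loop above: pageIDMap == {3: [0, 1], 4: [2]}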
+ # now get the css info
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
file(xname, 'wb').write(cssstr)
xname = os.path.join(xmlDir, 'other0000.xml')
glyfile.close()
print " "
+ # build up tocentries while processing html
+ tocentries = ''
+
# start up the html
htmlFileName = "book.html"
htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
# readability when rendering to the screen.
scaledpi = 1440.0
- svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
- svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
- svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
- svgindex += '<head>\n'
- svgindex += '<title>' + meta_array['Title'] + '</title>\n'
- svgindex += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
- svgindex += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
- if 'ASIN' in meta_array:
- svgindex += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
- if 'GUID' in meta_array:
- svgindex += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
- svgindex += '</head>\n'
- svgindex += '<body>\n'
-
filenames = os.listdir(pageDir)
filenames = sorted(filenames)
numfiles = len(filenames)
- counter = 0
+
+ xmllst = []
for filename in filenames:
# print ' ', filename
print ".",
-
fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.fromData(dict, fname)
+ # keep flat_xml for later svg processing
+ xmllst.append(flat_xml)
+
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
# first get the html
- htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
+ pagehtml, tocinfo = flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
+ tocentries += tocinfo
+ htmlstr += pagehtml
- # now get the svg image of the page
- svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)
+ # finish up the html string and output it
+ htmlstr += '</body>\n</html>\n'
+ file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
+
+ print " "
+ print 'Extracting Table of Contents from Amazon OCR'
+
+ # first create a table of contents file for the svg images
+ tochtml = '<?xml version="1.0" encoding="utf-8"?>\n'
+ tochtml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
+ tochtml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
+ tochtml += '<head>\n'
+ tochtml += '<title>' + meta_array['Title'] + '</title>\n'
+ tochtml += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
+ tochtml += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
+ if 'ASIN' in meta_array:
+ tochtml += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
+ if 'GUID' in meta_array:
+ tochtml += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
+ tochtml += '</head>\n'
+ tochtml += '<body>\n'
+
+ tochtml += '<h2>Table of Contents</h2>\n'
+ start = pageidnums[0]
+ if (raw):
+ startname = 'page%04d.svg' % start
+ else:
+ startname = 'page%04d.xhtml' % start
+
+ tochtml += '<h3><a href="' + startname + '">Start of Book</a></h3>\n'
+ # build up a table of contents for the svg xhtml output
+ toclst = tocentries.split('\n')
+ toclst.pop()
+ for entry in toclst:
+ print entry
+ title, pagenum = entry.split('|')
+ id = pageidnums[int(pagenum)]
+ if (raw):
+ fname = 'page%04d.svg' % id
+ else:
+ fname = 'page%04d.xhtml' % id
+ tochtml += '<h3><a href="'+ fname + '">' + title + '</a></h3>\n'
+ tochtml += '</body>\n'
+ tochtml += '</html>\n'
+ file(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
- if (raw) :
- pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
- svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
- else :
- pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
- svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)
+ # now create index_svg.xhtml that points to all required files
+ svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
+ svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
+ svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
+ svgindex += '<head>\n'
+ svgindex += '<title>' + meta_array['Title'] + '</title>\n'
+ svgindex += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
+ svgindex += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
+ if 'ASIN' in meta_array:
+ svgindex += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
+ if 'GUID' in meta_array:
+ svgindex += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
+ svgindex += '</head>\n'
+ svgindex += '<body>\n'
+ print "Building svg images of each book page"
+ svgindex += '<h2>List of Pages</h2>\n'
+ svgindex += '<div>\n'
+ idlst = sorted(pageIDMap.keys())
+ numids = len(idlst)
+ cnt = len(idlst)
+ previd = None
+ for j in range(cnt):
+ pageid = idlst[j]
+ if j < cnt - 1:
+ nextid = idlst[j+1]
+ else:
+ nextid = None
+ print '.',
+ pagelst = pageIDMap[pageid]
+ flat_svg = ''
+ for page in pagelst:
+ flat_svg += xmllst[page]
+ svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
+ if (raw) :
+ pfile = open(os.path.join(svgDir,'page%04d.svg' % pageid),'w')
+ svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (pageid, pageid)
+ else :
+ pfile = open(os.path.join(svgDir,'page%04d.xhtml' % pageid), 'w')
+ svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (pageid, pageid)
+ previd = pageid
pfile.write(svgxml)
pfile.close()
-
- counter += 1
-
- print " "
-
- # finish up the html string and output it
- htmlstr += '</body>\n</html>\n'
- file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
-
- # finish up the svg index string and output it
+ svgindex += '</div>\n'
+ svgindex += '<h2><a href="svg/toc.xhtml">Table of Contents</a></h2>\n'
svgindex += '</body>\n</html>\n'
file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
+ print " "
+
# build the opf file
opfname = os.path.join(bookDir, 'book.opf')
opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
return 1
raw = 0
- fixedimage = False
+ fixedimage = True
for o, a in opts:
if o =="-h":
usage()
# and many many others
-__version__ = '3.7'
+__version__ = '3.9'
class Unbuffered:
def __init__(self, stream):
import os, csv, getopt
import string
import re
+import traceback
class DrmException(Exception):
pass
print "Processing Book: ", title
filenametitle = cleanup_name(title)
outfilename = bookname
- if len(bookname)>4 and len(filenametitle)>4 and bookname[:4] != filenametitle[:4]:
+ if len(outfilename)<=8 or len(filenametitle)<=8:
outfilename = outfilename + "_" + filenametitle
+ elif outfilename[:8] != filenametitle[:8]:
+ outfilename = outfilename[:8] + "_" + filenametitle
+
+ # avoid excessively long file names
+ if len(outfilename)>150:
+ outfilename = outfilename[:150]
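A quick trace of the new naming rules (values invented; cleanup_name's exact output is assumed):

    bookname, filenametitle = 'B001ABCDEF_EBOK', 'My Example Book'
    # both longer than 8 chars and bookname[:8] != filenametitle[:8], so:
    # outfilename == 'B001ABCD_My Example Book'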
# build pid list
md1, md2 = mb.getPIDMetaInfo()
zipname = os.path.join(outdir, outfilename + '_nodrm' + '.htmlz')
mb.getHTMLZip(zipname)
- print " Creating SVG HTMLZ Archive"
- zipname = os.path.join(outdir, outfilename + '_SVG' + '.htmlz')
+ print " Creating SVG ZIP Archive"
+ zipname = os.path.join(outdir, outfilename + '_SVG' + '.zip')
mb.getSVGZip(zipname)
print " Creating XML ZIP Archive"
pos = foundpos + 1
return startpos
+ # returns a vector of integers for the tagpath
+ def getData(self, tagpath, pos, end):
+ argres=[]
+ (foundat, argt) = self.findinDoc(tagpath, pos, end)
+ if (argt != None) and (len(argt) > 0) :
+ argList = argt.split('|')
+ argres = [ int(strval) for strval in argList]
+ return argres
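So a flattened style line such as 'info.original.pid=3|3|4' comes back as the integer vector [3, 3, 4]; getpageIDMap below is a thin wrapper that fetches exactly that tag. The parsing step in isolation:

    argt = 'info.original.pid=3|3|4'.split('=', 1)[1]
    pids = [int(strval) for strval in argt.split('|')]
    # pids == [3, 3, 4]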
def process(self):
# create a document parser
dp = DocParser(flatxml, fontsize, ph, pw)
-
csspage = dp.process()
-
return csspage
+
+
+def getpageIDMap(flatxml):
+ dp = DocParser(flatxml, 0, 0, 0)
+ pageidnumbers = dp.getData('info.original.pid', 0, -1)
+ return pageidnumbers
def __init__(self, filename):
self.fo = file(filename, 'rb')
self.outdir = tempfile.mkdtemp()
+ # self.outdir = 'rawdat'
self.bookPayloadOffset = 0
self.bookHeaderRecords = {}
self.bookMetadata = {}
def cleanup(self):
if os.path.isdir(self.outdir):
- shutil.rmtree(self.outdir, True)
+ pass
+ # shutil.rmtree(self.outdir, True)
def usage(progname):
print "Removes DRM protection from Topaz ebooks and extract the contents"
tb.getHTMLZip(zipname)
print " Creating SVG ZIP Archive"
- zipname = os.path.join(outdir, bookname + '_SVG' + '.htmlz')
+ zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
tb.getSVGZip(zipname)
print " Creating XML ZIP Archive"
except TpzDRMError, e:
print str(e)
- tb.cleanup()
+ # tb.cleanup()
return 1
except Exception, e:
print str(e)
- tb.cleanup
+ # tb.cleanup()
return 1
return 0
Credit given to The Dark Reverser for the original standalone script.'
supported_platforms = ['linux', 'osx', 'windows'] # Platforms this plugin will run on
author = 'DiapDealer' # The author of this plugin
- version = (0, 0, 5) # The version number of this plugin
+ version = (0, 0, 6) # The version number of this plugin
file_types = set(['pdb']) # The file types that this plugin will be applied to
on_import = True # Run this plugin during the import
minimum_calibre_version = (0, 7, 55)
print " Decoding File"
sect = erdr2pml.Sectionizer(infile, 'PNRdPPrs')
- er = erdr2pml.EreaderProcessor(sect.loadSection, name, cc)
+ er = erdr2pml.EreaderProcessor(sect, name, cc)
if er.getNumImages() > 0:
print " Extracting images"
# 0.18 - on Windows try PyCrypto first and OpenSSL next
# 0.19 - Modify the interface to allow use of import
# 0.20 - modify to allow use inside new interface for calibre plugins
+# 0.21 - Support eReader (drm) version 11.
+# - Don't reject dictionary format.
+# - Ignore sidebars for dictionaries (different format?)
-__version__='0.20'
+__version__='0.21'
class Unbuffered:
def __init__(self, stream):
class Sectionizer(object):
+ bkType = "Book"
+
def __init__(self, filename, ident):
self.contents = file(filename, 'rb').read()
self.header = self.contents[0:72]
self.num_sections, = struct.unpack('>H', self.contents[76:78])
+ # Dictionary or normal content (TODO: should not be hard-coded)
if self.header[0x3C:0x3C+8] != ident:
- raise ValueError('Invalid file format')
+ if self.header[0x3C:0x3C+8] == "PDctPPrs":
+ self.bkType = "Dict"
+ else:
+ raise ValueError('Invalid file format')
self.sections = []
for i in xrange(self.num_sections):
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
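For background, a Palm database file carries a 78-byte header followed by num_sections 8-byte index entries, and the 8 bytes at offset 0x3C hold the type/creator pair that now distinguishes eReader books ('PNRdPPrs') from dictionaries ('PDctPPrs'). A minimal standalone sketch of walking that index, mirroring the unpacking above:

    import struct

    def pdb_section_offsets(contents):
        num_sections, = struct.unpack('>H', contents[76:78])  # big-endian count
        offsets = []
        for i in xrange(num_sections):
            offset, = struct.unpack('>L', contents[78 + i*8 : 78 + i*8 + 4])
            offsets.append(offset)
        return offsets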
return r
class EreaderProcessor(object):
- def __init__(self, section_reader, username, creditcard):
- self.section_reader = section_reader
- data = section_reader(0)
+ def __init__(self, sect, username, creditcard):
+ self.section_reader = sect.loadSection
+ data = self.section_reader(0)
version, = struct.unpack('>H', data[0:2])
self.version = version
logging.info('eReader file format version %s', version)
if version != 272 and version != 260 and version != 259:
raise ValueError('incorrect eReader version %d (error 1)' % version)
- data = section_reader(1)
+ data = self.section_reader(1)
self.data = data
des = Des(fixKey(data[0:8]))
cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
+ # Default values
+ self.num_footnote_pages = 0
+ self.num_sidebar_pages = 0
+ self.first_footnote_page = -1
+ self.first_sidebar_page = -1
if self.version == 272:
self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
- self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
- self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
+ if (sect.bkType == "Book"):
+ self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
+ self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
# self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
# self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
# self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
else:
- self.num_footnote_pages = 0
- self.num_sidebar_pages = 0
- self.first_footnote_page = -1
- self.first_sidebar_page = -1
+ # Nothing needs to be done
+ pass
# self.num_bookinfo_pages = 0
# self.num_chapter_pages = 0
# self.num_link_pages = 0
encrypted_key_sha = r[44:44+20]
encrypted_key = r[64:64+8]
elif version == 260:
- if drm_sub_version != 13:
+ if drm_sub_version != 13 and drm_sub_version != 11:
raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
- encrypted_key = r[44:44+8]
- encrypted_key_sha = r[52:52+20]
+ if drm_sub_version == 13:
+ encrypted_key = r[44:44+8]
+ encrypted_key_sha = r[52:52+20]
+ else:
+ encrypted_key = r[64:64+8]
+ encrypted_key_sha = r[44:44+20]
elif version == 272:
encrypted_key = r[172:172+8]
encrypted_key_sha = r[56:56+20]
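Taken together, the branches above amount to a small layout table for the 8-byte DES-encrypted key and its 20-byte SHA digest inside the decrypted record r (derived from the code, not from a published format spec); note that the new sub-version 11 layout matches the oldest branch:

    version 259:          key = r[64:64+8]    sha = r[44:44+20]
    version 260, sub 13:  key = r[44:44+8]    sha = r[52:52+20]
    version 260, sub 11:  key = r[64:64+8]    sha = r[44:44+20]
    version 272:          key = r[172:172+8]  sha = r[56:56+20]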
r += fmarker
fnote_ids = fnote_ids[id_len+4:]
+ # TODO: Handle dictionary index (?) pages - which are also marked as
+ # sidebar_pages (?). For now dictionary sidebars are ignored
+ # For dictionaries - record 0 is null terminated strings, followed by
+ # blocks of around 62000 bytes and a final block. Not sure of the
+ # encoding
+
# now handle sidebar pages
if self.num_sidebar_pages > 0:
r += '\n'
id_len = ord(sbar_ids[2])
id = sbar_ids[3:3+id_len]
smarker = '<sidebar id="%s">\n' % id
- smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
+ smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
smarker += '\n</sidebar>\n'
r += smarker
sbar_ids = sbar_ids[id_len+4:]
bookname = os.path.splitext(os.path.basename(infile))[0]
print " Decoding File"
sect = Sectionizer(infile, 'PNRdPPrs')
- er = EreaderProcessor(sect.loadSection, name, cc)
+ er = EreaderProcessor(sect, name, cc)
if er.getNumImages() > 0:
print " Extracting images"
<key>CFBundleExecutable</key>
<string>droplet</string>
<key>CFBundleGetInfoString</key>
- <string>DeDRM 3.0, Written 2010–2011 by Apprentice Alf and others.</string>
+ <string>DeDRM 3.1, Written 2010–2011 by Apprentice Alf and others.</string>
<key>CFBundleIconFile</key>
<string>droplet</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleName</key>
- <string>DeDRM 3.0</string>
+ <string>DeDRM 3.1</string>
<key>CFBundlePackageType</key>
<string>APPL</string>
<key>CFBundleShortVersionString</key>
- <string>3.0</string>
+ <string>3.1</string>
<key>CFBundleSignature</key>
<string>dplt</string>
<key>LSMinimumSystemVersion</key>
<true/>
<key>WindowState</key>
<dict>
+ <key>dividerCollapsed</key>
+ <false/>
+ <key>eventLogLevel</key>
+ <integer>-1</integer>
<key>name</key>
<string>ScriptWindowState</string>
<key>positionOfDivider</key>
- <real>274</real>
+ <real>460</real>
<key>savedFrame</key>
- <string>39 376 439 476 0 0 1440 878 </string>
+ <string>39 106 1316 746 0 0 1440 878 </string>
<key>selectedTabView</key>
- <string>result</string>
+ <string>event log</string>
</dict>
</dict>
</plist>
from struct import pack
from struct import unpack
+class TpzDRMError(Exception):
+ pass
# Get a 7-bit encoded number from a string. The most
# significant byte comes first and has the high bit (8th) set
return self.stable[self.pos]
else:
print "Error - %d outside of string table limits" % val
- sys.exit(-1)
+ raise TpzDRMError('outside of string table limits')
+ # sys.exit(-1)
def getSize(self):
return self.size
'paragraph.class' : (1, 'scalar_text', 0, 0),
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
+
'word_semantic' : (1, 'snippets', 1, 1),
'word_semantic.type' : (1, 'scalar_text', 0, 0),
'_span' : (1, 'snippets', 1, 0),
'_span.firstWord' : (1, 'scalar_number', 0, 0),
- '-span.lastWord' : (1, 'scalar_number', 0, 0),
+ '_span.lastWord' : (1, 'scalar_number', 0, 0),
+ '_span.gridSize' : (1, 'scalar_number', 0, 0),
+ '_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ '_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'span' : (1, 'snippets', 1, 0),
'span.firstWord' : (1, 'scalar_number', 0, 0),
'span.lastWord' : (1, 'scalar_number', 0, 0),
+ 'span.gridSize' : (1, 'scalar_number', 0, 0),
+ 'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ 'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'extratokens' : (1, 'snippets', 1, 0),
'extratokens.type' : (1, 'scalar_text', 0, 0),
-{\rtf1\ansi\ansicpg1252\cocoartf949\cocoasubrtf540
+{\rtf1\ansi\ansicpg1252\cocoartf1038\cocoasubrtf360
{\fonttbl}
{\colortbl;\red255\green255\blue255;}
}
\ No newline at end of file
# 0.18 - on Windows try PyCrypto first and OpenSSL next
# 0.19 - Modify the interface to allow use of import
# 0.20 - modify to allow use inside new interface for calibre plugins
+# 0.21 - Support eReader (drm) version 11.
+# - Don't reject dictionary format.
+# - Ignore sidebars for dictionaries (different format?)
-__version__='0.20'
+__version__='0.21'
class Unbuffered:
def __init__(self, stream):
class Sectionizer(object):
+ bkType = "Book"
+
def __init__(self, filename, ident):
self.contents = file(filename, 'rb').read()
self.header = self.contents[0:72]
self.num_sections, = struct.unpack('>H', self.contents[76:78])
+ # Dictionary or normal content (TODO: should not be hard-coded)
if self.header[0x3C:0x3C+8] != ident:
- raise ValueError('Invalid file format')
+ if self.header[0x3C:0x3C+8] == "PDctPPrs":
+ self.bkType = "Dict"
+ else:
+ raise ValueError('Invalid file format')
self.sections = []
for i in xrange(self.num_sections):
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
return r
class EreaderProcessor(object):
- def __init__(self, section_reader, username, creditcard):
- self.section_reader = section_reader
- data = section_reader(0)
+ def __init__(self, sect, username, creditcard):
+ self.section_reader = sect.loadSection
+ data = self.section_reader(0)
version, = struct.unpack('>H', data[0:2])
self.version = version
logging.info('eReader file format version %s', version)
if version != 272 and version != 260 and version != 259:
raise ValueError('incorrect eReader version %d (error 1)' % version)
- data = section_reader(1)
+ data = self.section_reader(1)
self.data = data
des = Des(fixKey(data[0:8]))
cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
+ # Default values
+ self.num_footnote_pages = 0
+ self.num_sidebar_pages = 0
+ self.first_footnote_page = -1
+ self.first_sidebar_page = -1
if self.version == 272:
self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
- self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
- self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
+ if (sect.bkType == "Book"):
+ self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
+ self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
# self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
# self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
# self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
else:
- self.num_footnote_pages = 0
- self.num_sidebar_pages = 0
- self.first_footnote_page = -1
- self.first_sidebar_page = -1
+ # Nothing needs to be done
+ pass
# self.num_bookinfo_pages = 0
# self.num_chapter_pages = 0
# self.num_link_pages = 0
encrypted_key_sha = r[44:44+20]
encrypted_key = r[64:64+8]
elif version == 260:
- if drm_sub_version != 13:
+ if drm_sub_version != 13 and drm_sub_version != 11:
raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
- encrypted_key = r[44:44+8]
- encrypted_key_sha = r[52:52+20]
+ if drm_sub_version == 13:
+ encrypted_key = r[44:44+8]
+ encrypted_key_sha = r[52:52+20]
+ else:
+ encrypted_key = r[64:64+8]
+ encrypted_key_sha = r[44:44+20]
elif version == 272:
encrypted_key = r[172:172+8]
encrypted_key_sha = r[56:56+20]
r += fmarker
fnote_ids = fnote_ids[id_len+4:]
+ # TODO: Handle dictionary index (?) pages - which are also marked as
+ # sidebar_pages (?). For now dictionary sidebars are ignored
+ # For dictionaries - record 0 is null terminated strings, followed by
+ # blocks of around 62000 bytes and a final block. Not sure of the
+ # encoding
+
# now handle sidebar pages
if self.num_sidebar_pages > 0:
r += '\n'
id_len = ord(sbar_ids[2])
id = sbar_ids[3:3+id_len]
smarker = '<sidebar id="%s">\n' % id
- smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
+ smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
smarker += '\n</sidebar>\n'
r += smarker
sbar_ids = sbar_ids[id_len+4:]
bookname = os.path.splitext(os.path.basename(infile))[0]
print " Decoding File"
sect = Sectionizer(infile, 'PNRdPPrs')
- er = EreaderProcessor(sect.loadSection, name, cc)
+ er = EreaderProcessor(sect, name, cc)
if er.getNumImages() > 0:
print " Extracting images"
pclass = self.getClass(pclass)
+ # if paragraph uses extratokens (extra glyphs) then make it fixed
+ (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
+
# build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
last = int(slast)
makeImage = (regtype == 'vertical') or (regtype == 'table')
+ makeImage = makeImage or (extraglyphs != None)
if self.fixedimage:
makeImage = makeImage or (regtype == 'fixed')
word_class = ''
+ word_semantic_type = ''
+
while (line < end) :
(name, argres) = self.lineinDoc(line)
return parares
+ def buildTOCEntry(self, pdesc) :
+ parares = ''
+ sep =''
+ tocentry = ''
+ handle_links = len(self.link_id) > 0
+
+ lstart = 0
+
+ cnt = len(pdesc)
+ for j in xrange( 0, cnt) :
+
+ (wtype, num) = pdesc[j]
+
+ if wtype == 'ocr' :
+ word = self.ocrtext[num]
+ sep = ' '
+
+ if handle_links:
+ link = self.link_id[num]
+ if (link > 0):
+ linktype = self.link_type[link-1]
+ title = self.link_title[link-1]
+ title = title.rstrip('. ')
+ alt_title = parares[lstart:]
+ alt_title = alt_title.strip()
+ # now strip off the actual printed page number
+ alt_title = alt_title.rstrip('0123456789ivxldIVXLD-.')
+ alt_title = alt_title.rstrip('. ')
+ # skip over any external links - can't have them in a book's toc
+ if linktype == 'external' :
+ title = ''
+ alt_title = ''
+ linkpage = ''
+ else :
+ if len(self.link_page) >= link :
+ ptarget = self.link_page[link-1] - 1
+ linkpage = '%04d' % ptarget
+ else :
+ # just link to the current page
+ linkpage = self.id[4:]
+ if len(alt_title) >= len(title):
+ title = alt_title
+ if title != '' and linkpage != '':
+ tocentry += title + '|' + linkpage + '\n'
+ lstart = len(parares)
+ if word == '_link_' : word = ''
+ elif (link < 0) :
+ if word == '_link_' : word = ''
+
+ if word == '_lb_':
+ word = ''
+ sep = ''
+
+ if num in self.dehyphen_rootid :
+ word = word[0:-1]
+ sep = ''
+
+ parares += word + sep
+
+ else :
+ continue
+
+ return tocentry
+
+
+
# walk the document tree collecting the information needed
# to build an html page using the ocrText
def process(self):
htmlpage = ''
+ tocinfo = ''
# get the ocr text
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
+ tocinfo += self.buildTOCEntry(pdesc)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
-
elif (regtype == 'vertical') or (regtype == 'table') :
ptype = 'full'
if inGroup:
htmlpage = htmlpage[0:-4]
last_para_continued = False
- return htmlpage
-
+ return htmlpage, tocinfo
def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
# create a document parser
dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
- htmlpage = dp.process()
- return htmlpage
+ htmlpage, tocinfo = dp.process()
+ return htmlpage, tocinfo
class PParser(object):
- def __init__(self, gd, flatxml):
+ def __init__(self, gd, flatxml, meta_array):
self.gd = gd
self.flatdoc = flatxml.split('\n')
+ self.docSize = len(self.flatdoc)
self.temp = []
- foo = self.getData('page.h') or self.getData('book.h')
- self.ph = foo[0]
- foo = self.getData('page.w') or self.getData('book.w')
- self.pw = foo[0]
- self.gx = self.getData('info.glyph.x')
- self.gy = self.getData('info.glyph.y')
- self.gid = self.getData('info.glyph.glyphID')
+
+ self.ph = -1
+ self.pw = -1
+ startpos = self.posinDoc('page.h') or self.posinDoc('book.h')
+ for p in startpos:
+ (name, argres) = self.lineinDoc(p)
+ self.ph = max(self.ph, int(argres))
+ startpos = self.posinDoc('page.w') or self.posinDoc('book.w')
+ for p in startpos:
+ (name, argres) = self.lineinDoc(p)
+ self.pw = max(self.pw, int(argres))
+
+ if self.ph <= 0:
+ self.ph = int(meta_array.get('pageHeight', '11000'))
+ if self.pw <= 0:
+ self.pw = int(meta_array.get('pageWidth', '8500'))
+
+ res = []
+ startpos = self.posinDoc('info.glyph.x')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.x', p)
+ res.extend(argres)
+ self.gx = res
+
+ res = []
+ startpos = self.posinDoc('info.glyph.y')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.y', p)
+ res.extend(argres)
+ self.gy = res
+
+ res = []
+ startpos = self.posinDoc('info.glyph.glyphID')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.glyphID', p)
+ res.extend(argres)
+ self.gid = res
+
+
+ # return tag at line pos in document
+ def lineinDoc(self, pos) :
+ if (pos >= 0) and (pos < self.docSize) :
+ item = self.flatdoc[pos]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=',1)
+ else :
+ name = item
+ argres = ''
+ return name, argres
+
+ # find tag in doc if within pos to end inclusive
+ def findinDoc(self, tagpath, pos, end) :
+ result = None
+ if end == -1 :
+ end = self.docSize
+ else:
+ end = min(self.docSize, end)
+ foundat = -1
+ for j in xrange(pos, end):
+ item = self.flatdoc[j]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=',1)
+ else :
+ name = item
+ argres = ''
+ if name.endswith(tagpath) :
+ result = argres
+ foundat = j
+ break
+ return foundat, result
+
+ # return list of start positions for the tagpath
+ def posinDoc(self, tagpath):
+ startpos = []
+ pos = 0
+ res = ""
+ while res != None :
+ (foundpos, res) = self.findinDoc(tagpath, pos, -1)
+ if res != None :
+ startpos.append(foundpos)
+ pos = foundpos + 1
+ return startpos
+
def getData(self, path):
result = None
cnt = len(self.flatdoc)
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
+
+ def getDataatPos(self, path, pos):
+ result = None
+ item = self.flatdoc[pos]
+ if item.find('=') >= 0:
+ (name, argt) = item.split('=',1)
+ argres = argt.split('|')
+ else:
+ name = item
+ argres = []
+ if (len(argres) > 0) :
+ for j in xrange(0,len(argres)):
+ argres[j] = int(argres[j])
+ if (name.endswith(path)):
+ result = argres
+ return result
+
def getDataTemp(self, path):
result = None
cnt = len(self.temp)
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
+
def getImages(self):
result = []
self.temp = self.flatdoc
src = self.getDataTemp('img.src')[0]
result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
return result
+
def getGlyphs(self):
result = []
if (self.gid != None) and (len(self.gid) > 0):
return result
-def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
+def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi):
ml = ''
- pp = PParser(gdict, flat_xml)
+ pp = PParser(gdict, flat_xml, meta_array)
ml += '<?xml version="1.0" standalone="no"?>\n'
if (raw):
ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
- ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
+ ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
else:
ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
- ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
+ ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
ml += '<script><![CDATA[\n'
ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
ml += 'var dpi=%d;\n' % scaledpi
- if (counter) :
- ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
- if (counter < numfiles-1) :
- ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
+ if (previd) :
+ ml += 'var prevpage="page%04d.xhtml";\n' % (previd)
+ if (nextid) :
+ ml += 'var nextpage="page%04d.xhtml";\n' % (nextid)
ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
ml += '</head>\n'
ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
ml += '<div style="white-space:nowrap;">\n'
- if (counter == 0) :
+ if previd == None:
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
else:
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'
+
ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
if (pp.gid != None):
ml += '<defs>\n'
for j in xrange(0,len(pp.gid)):
ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
- ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n'
+ xpos = "%d" % (pp.pw // 3)
+ ypos = "%d" % (pp.ph // 3)
+ ml += '<text x="' + xpos + '" y="' + ypos + '" font-size="' + meta_array['fontSize'] + '" font-family="Helvetica" stroke="black">This page intentionally left blank.</text>\n'
if (raw) :
ml += '</svg>'
else :
ml += '</svg></a>\n'
- if (counter == numfiles - 1) :
+ if nextid == None:
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
else :
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'
from struct import pack
from struct import unpack
+class TpzDRMError(Exception):
+ pass
# local support routines
if 'calibre' in sys.modules:
return self.stable[self.pos]
else:
print "Error - %d outside of string table limits" % val
- sys.exit(-1)
+ raise TpzDRMError('outside of string table limits')
+ # sys.exit(-1)
def getSize(self):
return self.size
def getPos(self):
(ph, pw) = getPageDim(flat_xml)
if (ph == '-1') or (ph == '0') : ph = '11000'
if (pw == '-1') or (pw == '0') : pw = '8500'
-
- # print ' ', 'other0000.dat'
+ meta_array['pageHeight'] = ph
+ meta_array['pageWidth'] = pw
+ if 'fontSize' not in meta_array.keys():
+ meta_array['fontSize'] = fontsize
+
+ # process other.dat for css info and for map of page files to svg images
+ # this map is needed because some pages actually are made up of multiple
+ # pageXXXX.xml files
xname = os.path.join(bookDir, 'style.css')
flat_xml = convert2xml.fromData(dict, otherFile)
+
+ # extract info.original.pid to get original page information
+ pageIDMap = {}
+ pageidnums = stylexml2css.getpageIDMap(flat_xml)
+ if len(pageidnums) == 0:
+ filenames = os.listdir(pageDir)
+ numfiles = len(filenames)
+ for k in range(numfiles):
+ pageidnums.append(k)
+ # create a map from page ids to list of page file nums to process for that page
+ for i in range(len(pageidnums)):
+ id = pageidnums[i]
+ if id in pageIDMap.keys():
+ pageIDMap[id].append(i)
+ else:
+ pageIDMap[id] = [i]
+
+ # now get the css info
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
file(xname, 'wb').write(cssstr)
xname = os.path.join(xmlDir, 'other0000.xml')
glyfile.close()
print " "
+ # build up tocentries while processing html
+ tocentries = ''
+
# start up the html
htmlFileName = "book.html"
htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
# readability when rendering to the screen.
scaledpi = 1440.0
- svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
- svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
- svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
- svgindex += '<head>\n'
- svgindex += '<title>' + meta_array['Title'] + '</title>\n'
- svgindex += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
- svgindex += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
- if 'ASIN' in meta_array:
- svgindex += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
- if 'GUID' in meta_array:
- svgindex += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
- svgindex += '</head>\n'
- svgindex += '<body>\n'
-
filenames = os.listdir(pageDir)
filenames = sorted(filenames)
numfiles = len(filenames)
- counter = 0
+
+ xmllst = []
for filename in filenames:
# print ' ', filename
print ".",
-
fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.fromData(dict, fname)
+ # keep flat_xml for later svg processing
+ xmllst.append(flat_xml)
+
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
# first get the html
- htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
+ pagehtml, tocinfo = flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
+ tocentries += tocinfo
+ htmlstr += pagehtml
- # now get the svg image of the page
- svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)
+ # finish up the html string and output it
+ htmlstr += '</body>\n</html>\n'
+ file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
+
+ print " "
+ print 'Extracting Table of Contents from Amazon OCR'
+
+ # first create a table of contents file for the svg images
+ tochtml = '<?xml version="1.0" encoding="utf-8"?>\n'
+ tochtml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
+ tochtml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
+ tochtml += '<head>\n'
+ tochtml += '<title>' + meta_array['Title'] + '</title>\n'
+ tochtml += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
+ tochtml += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
+ if 'ASIN' in meta_array:
+ tochtml += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
+ if 'GUID' in meta_array:
+ tochtml += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
+ tochtml += '</head>\n'
+ tochtml += '<body>\n'
+
+ tochtml += '<h2>Table of Contents</h2>\n'
+ start = pageidnums[0]
+ if (raw):
+ startname = 'page%04d.svg' % start
+ else:
+ startname = 'page%04d.xhtml' % start
+
+ tochtml += '<h3><a href="' + startname + '">Start of Book</a></h3>\n'
+ # build up a table of contents for the svg xhtml output
+ toclst = tocentries.split('\n')
+ toclst.pop()
+ for entry in toclst:
+ print entry
+ title, pagenum = entry.split('|')
+ id = pageidnums[int(pagenum)]
+ if (raw):
+ fname = 'page%04d.svg' % id
+ else:
+ fname = 'page%04d.xhtml' % id
+ tochtml += '<h3><a href="'+ fname + '">' + title + '</a></h3>\n'
+ tochtml += '</body>\n'
+ tochtml += '</html>\n'
+ file(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
- if (raw) :
- pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
- svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
- else :
- pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
- svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)
+ # now create index_svg.xhtml that points to all required files
+ svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
+ svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
+ svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
+ svgindex += '<head>\n'
+ svgindex += '<title>' + meta_array['Title'] + '</title>\n'
+ svgindex += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
+ svgindex += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
+ if 'ASIN' in meta_array:
+ svgindex += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
+ if 'GUID' in meta_array:
+ svgindex += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
+ svgindex += '</head>\n'
+ svgindex += '<body>\n'
+ print "Building svg images of each book page"
+ svgindex += '<h2>List of Pages</h2>\n'
+ svgindex += '<div>\n'
+ idlst = sorted(pageIDMap.keys())
+ numids = len(idlst)
+ cnt = len(idlst)
+ previd = None
+ for j in range(cnt):
+ pageid = idlst[j]
+ if j < cnt - 1:
+ nextid = idlst[j+1]
+ else:
+ nextid = None
+ print '.',
+ pagelst = pageIDMap[pageid]
+ flat_svg = ''
+ for page in pagelst:
+ flat_svg += xmllst[page]
+ svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
+ if (raw) :
+ pfile = open(os.path.join(svgDir,'page%04d.svg' % pageid),'w')
+ svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (pageid, pageid)
+ else :
+ pfile = open(os.path.join(svgDir,'page%04d.xhtml' % pageid), 'w')
+ svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (pageid, pageid)
+ previd = pageid
pfile.write(svgxml)
pfile.close()
-
- counter += 1
-
- print " "
-
- # finish up the html string and output it
- htmlstr += '</body>\n</html>\n'
- file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
-
- # finish up the svg index string and output it
+ svgindex += '</div>\n'
+ svgindex += '<h2><a href="svg/toc.xhtml">Table of Contents</a></h2>\n'
svgindex += '</body>\n</html>\n'
file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
+ print " "
+
# build the opf file
opfname = os.path.join(bookDir, 'book.opf')
opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
return 1
raw = 0
- fixedimage = False
+ fixedimage = True
for o, a in opts:
if o =="-h":
usage()
# and many many others
-__version__ = '3.7'
+__version__ = '3.9'
class Unbuffered:
def __init__(self, stream):
import os, csv, getopt
import string
import re
+import traceback
class DrmException(Exception):
pass
print "Processing Book: ", title
filenametitle = cleanup_name(title)
outfilename = bookname
- if len(bookname)>4 and len(filenametitle)>4 and bookname[:4] != filenametitle[:4]:
+ if len(outfilename)<=8 or len(filenametitle)<=8:
outfilename = outfilename + "_" + filenametitle
+ elif outfilename[:8] != filenametitle[:8]:
+ outfilename = outfilename[:8] + "_" + filenametitle
+
+ # avoid excessively long file names
+ if len(outfilename)>150:
+ outfilename = outfilename[:150]
# build pid list
md1, md2 = mb.getPIDMetaInfo()
zipname = os.path.join(outdir, outfilename + '_nodrm' + '.htmlz')
mb.getHTMLZip(zipname)
- print " Creating SVG HTMLZ Archive"
- zipname = os.path.join(outdir, outfilename + '_SVG' + '.htmlz')
+ print " Creating SVG ZIP Archive"
+ zipname = os.path.join(outdir, outfilename + '_SVG' + '.zip')
mb.getSVGZip(zipname)
print " Creating XML ZIP Archive"
pos = foundpos + 1
return startpos
+ # returns a vector of integers for the tagpath
+ def getData(self, tagpath, pos, end):
+ argres=[]
+ (foundat, argt) = self.findinDoc(tagpath, pos, end)
+ if (argt != None) and (len(argt) > 0) :
+ argList = argt.split('|')
+ argres = [ int(strval) for strval in argList]
+ return argres
def process(self):
# create a document parser
dp = DocParser(flatxml, fontsize, ph, pw)
-
csspage = dp.process()
-
return csspage
+
+
+def getpageIDMap(flatxml):
+ dp = DocParser(flatxml, 0, 0, 0)
+ pageidnumbers = dp.getData('info.original.pid', 0, -1)
+ return pageidnumbers
def __init__(self, filename):
self.fo = file(filename, 'rb')
self.outdir = tempfile.mkdtemp()
+ # self.outdir = 'rawdat'
self.bookPayloadOffset = 0
self.bookHeaderRecords = {}
self.bookMetadata = {}
def cleanup(self):
if os.path.isdir(self.outdir):
- shutil.rmtree(self.outdir, True)
+ pass
+ # shutil.rmtree(self.outdir, True)
def usage(progname):
print "Removes DRM protection from Topaz ebooks and extract the contents"
tb.getHTMLZip(zipname)
print " Creating SVG ZIP Archive"
- zipname = os.path.join(outdir, bookname + '_SVG' + '.htmlz')
+ zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
tb.getSVGZip(zipname)
print " Creating XML ZIP Archive"
except TpzDRMError, e:
print str(e)
- tb.cleanup()
+ # tb.cleanup()
return 1
except Exception, e:
print str(e)
- tb.cleanup
+ # tb.cleanup()
return 1
return 0
from struct import pack
from struct import unpack
+class TpzDRMError(Exception):
+ pass
# Get a 7-bit encoded number from a string. The most
# significant byte comes first and has the high bit (8th) set
return self.stable[self.pos]
else:
print "Error - %d outside of string table limits" % val
- sys.exit(-1)
+ raise TpzDRMError('outside of string table limits')
+ # sys.exit(-1)
def getSize(self):
return self.size
'paragraph.class' : (1, 'scalar_text', 0, 0),
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
+
'word_semantic' : (1, 'snippets', 1, 1),
'word_semantic.type' : (1, 'scalar_text', 0, 0),
'_span' : (1, 'snippets', 1, 0),
'_span.firstWord' : (1, 'scalar_number', 0, 0),
- '-span.lastWord' : (1, 'scalar_number', 0, 0),
+ '_span.lastWord' : (1, 'scalar_number', 0, 0),
+ '_span.gridSize' : (1, 'scalar_number', 0, 0),
+ '_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ '_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'span' : (1, 'snippets', 1, 0),
'span.firstWord' : (1, 'scalar_number', 0, 0),
'span.lastWord' : (1, 'scalar_number', 0, 0),
+ 'span.gridSize' : (1, 'scalar_number', 0, 0),
+ 'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ 'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'extratokens' : (1, 'snippets', 1, 0),
'extratokens.type' : (1, 'scalar_text', 0, 0),
# 0.18 - on Windows try PyCrypto first and OpenSSL next
# 0.19 - Modify the interface to allow use of import
# 0.20 - modify to allow use inside new interface for calibre plugins
+# 0.21 - Support eReader (drm) version 11.
+# - Don't reject dictionary format.
+# - Ignore sidebars for dictionaries (different format?)
-__version__='0.20'
+__version__='0.21'
class Unbuffered:
def __init__(self, stream):
class Sectionizer(object):
+ bkType = "Book"
+
def __init__(self, filename, ident):
self.contents = file(filename, 'rb').read()
self.header = self.contents[0:72]
self.num_sections, = struct.unpack('>H', self.contents[76:78])
+ # Dictionary or normal content? (TODO: should not be hard-coded)
if self.header[0x3C:0x3C+8] != ident:
- raise ValueError('Invalid file format')
+ if self.header[0x3C:0x3C+8] == "PDctPPrs":
+ self.bkType = "Dict"
+ else:
+ raise ValueError('Invalid file format')
self.sections = []
for i in xrange(self.num_sections):
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
return r
class EreaderProcessor(object):
- def __init__(self, section_reader, username, creditcard):
- self.section_reader = section_reader
- data = section_reader(0)
+ def __init__(self, sect, username, creditcard):
+ self.section_reader = sect.loadSection
+ data = self.section_reader(0)
version, = struct.unpack('>H', data[0:2])
self.version = version
logging.info('eReader file format version %s', version)
if version != 272 and version != 260 and version != 259:
raise ValueError('incorrect eReader version %d (error 1)' % version)
- data = section_reader(1)
+ data = self.section_reader(1)
self.data = data
des = Des(fixKey(data[0:8]))
cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
+ # Default values
+ self.num_footnote_pages = 0
+ self.num_sidebar_pages = 0
+ self.first_footnote_page = -1
+ self.first_sidebar_page = -1
if self.version == 272:
self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
- self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
- self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
+ if (sect.bkType == "Book"):
+ self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
+ self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
# self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
# self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
# self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
else:
- self.num_footnote_pages = 0
- self.num_sidebar_pages = 0
- self.first_footnote_page = -1
- self.first_sidebar_page = -1
+ # Nothing needs to be done
+ pass
# self.num_bookinfo_pages = 0
# self.num_chapter_pages = 0
# self.num_link_pages = 0
encrypted_key_sha = r[44:44+20]
encrypted_key = r[64:64+8]
elif version == 260:
- if drm_sub_version != 13:
+ if drm_sub_version != 13 and drm_sub_version != 11:
raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
- encrypted_key = r[44:44+8]
- encrypted_key_sha = r[52:52+20]
+ if drm_sub_version == 13:
+ encrypted_key = r[44:44+8]
+ encrypted_key_sha = r[52:52+20]
+ else:
+ encrypted_key = r[64:64+8]
+ encrypted_key_sha = r[44:44+20]
elif version == 272:
encrypted_key = r[172:172+8]
encrypted_key_sha = r[56:56+20]
r += fmarker
fnote_ids = fnote_ids[id_len+4:]
+ # TODO: Handle dictionary index (?) pages - which are also marked as
+ # sidebar_pages (?). For now dictionary sidebars are ignored
+ # For dictionaries - record 0 is null terminated strings, followed by
+ # blocks of around 62000 bytes and a final block. Not sure of the
+ # encoding
+
# now handle sidebar pages
if self.num_sidebar_pages > 0:
r += '\n'
id_len = ord(sbar_ids[2])
id = sbar_ids[3:3+id_len]
smarker = '<sidebar id="%s">\n' % id
- smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
+ smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
smarker += '\n</sidebar>\n'
r += smarker
sbar_ids = sbar_ids[id_len+4:]
bookname = os.path.splitext(os.path.basename(infile))[0]
print " Decoding File"
sect = Sectionizer(infile, 'PNRdPPrs')
- er = EreaderProcessor(sect.loadSection, name, cc)
+ er = EreaderProcessor(sect, name, cc)
if er.getNumImages() > 0:
print " Extracting images"
pclass = self.getClass(pclass)
+ # if paragraph uses extratokens (extra glyphs) then make it fixed
+ (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
+
# build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
last = int(slast)
makeImage = (regtype == 'vertical') or (regtype == 'table')
+ makeImage = makeImage or (extraglyphs != None)
if self.fixedimage:
makeImage = makeImage or (regtype == 'fixed')
word_class = ''
+ word_semantic_type = ''
+
while (line < end) :
(name, argres) = self.lineinDoc(line)
return parares
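+
+ # scan a paragraph description for internal links and collect
+ # "title|pagenum" lines that are later used to build the svg table of contents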
+ def buildTOCEntry(self, pdesc) :
+ parares = ''
+ sep =''
+ tocentry = ''
+ handle_links = len(self.link_id) > 0
+
+ lstart = 0
+
+ cnt = len(pdesc)
+ for j in xrange( 0, cnt) :
+
+ (wtype, num) = pdesc[j]
+
+ if wtype == 'ocr' :
+ word = self.ocrtext[num]
+ sep = ' '
+
+ if handle_links:
+ link = self.link_id[num]
+ if (link > 0):
+ linktype = self.link_type[link-1]
+ title = self.link_title[link-1]
+ title = title.rstrip('. ')
+ alt_title = parares[lstart:]
+ alt_title = alt_title.strip()
+ # now strip off the actual printed page number
+ alt_title = alt_title.rstrip('01234567890ivxldIVXLD-.')
+ alt_title = alt_title.rstrip('. ')
+ # skip over any external links - can't have them in a books toc
+ if linktype == 'external' :
+ title = ''
+ alt_title = ''
+ linkpage = ''
+ else :
+ if len(self.link_page) >= link :
+ ptarget = self.link_page[link-1] - 1
+ linkpage = '%04d' % ptarget
+ else :
+ # just link to the current page
+ linkpage = self.id[4:]
+ if len(alt_title) >= len(title):
+ title = alt_title
+ if title != '' and linkpage != '':
+ tocentry += title + '|' + linkpage + '\n'
+ lstart = len(parares)
+ if word == '_link_' : word = ''
+ elif (link < 0) :
+ if word == '_link_' : word = ''
+
+ if word == '_lb_':
+ word = ''
+ sep = ''
+
+ if num in self.dehyphen_rootid :
+ word = word[0:-1]
+ sep = ''
+
+ parares += word + sep
+
+ else :
+ continue
+
+ return tocentry
+
+
+
# walk the document tree collecting the information needed
# to build an html page using the ocrText
def process(self):
htmlpage = ''
+ tocinfo = ''
# get the ocr text
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
+ tocinfo += self.buildTOCEntry(pdesc)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
-
elif (regtype == 'vertical') or (regtype == 'table') :
ptype = 'full'
if inGroup:
htmlpage = htmlpage[0:-4]
last_para_continued = False
- return htmlpage
-
+ return htmlpage, tocinfo
def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
# create a document parser
dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
- htmlpage = dp.process()
- return htmlpage
+ htmlpage, tocinfo = dp.process()
+ return htmlpage, tocinfo
class PParser(object):
- def __init__(self, gd, flatxml):
+ def __init__(self, gd, flatxml, meta_array):
self.gd = gd
self.flatdoc = flatxml.split('\n')
+ self.docSize = len(self.flatdoc)
self.temp = []
- foo = self.getData('page.h') or self.getData('book.h')
- self.ph = foo[0]
- foo = self.getData('page.w') or self.getData('book.w')
- self.pw = foo[0]
- self.gx = self.getData('info.glyph.x')
- self.gy = self.getData('info.glyph.y')
- self.gid = self.getData('info.glyph.glyphID')
+
+ self.ph = -1
+ self.pw = -1
+ startpos = self.posinDoc('page.h') or self.posinDoc('book.h')
+ for p in startpos:
+ (name, argres) = self.lineinDoc(p)
+ self.ph = max(self.ph, int(argres))
+ startpos = self.posinDoc('page.w') or self.posinDoc('book.w')
+ for p in startpos:
+ (name, argres) = self.lineinDoc(p)
+ self.pw = max(self.pw, int(argres))
+
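+ # a page merged from several page files may report several page.h/page.w
+ # values (hence the max() above); fall back to the book metadata otherwise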
+ if self.ph <= 0:
+ self.ph = int(meta_array.get('pageHeight', '11000'))
+ if self.pw <= 0:
+ self.pw = int(meta_array.get('pageWidth', '8500'))
+
+ res = []
+ startpos = self.posinDoc('info.glyph.x')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.x', p)
+ res.extend(argres)
+ self.gx = res
+
+ res = []
+ startpos = self.posinDoc('info.glyph.y')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.y', p)
+ res.extend(argres)
+ self.gy = res
+
+ res = []
+ startpos = self.posinDoc('info.glyph.glyphID')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.glyphID', p)
+ res.extend(argres)
+ self.gid = res
+
+
+ # return tag at line pos in document
+ def lineinDoc(self, pos) :
+ if (pos >= 0) and (pos < self.docSize) :
+ item = self.flatdoc[pos]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=',1)
+ else :
+ name = item
+ argres = ''
+ return name, argres
+
+ # find the first tag matching tagpath between pos and end
+ def findinDoc(self, tagpath, pos, end) :
+ result = None
+ if end == -1 :
+ end = self.docSize
+ else:
+ end = min(self.docSize, end)
+ foundat = -1
+ for j in xrange(pos, end):
+ item = self.flatdoc[j]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=',1)
+ else :
+ name = item
+ argres = ''
+ if name.endswith(tagpath) :
+ result = argres
+ foundat = j
+ break
+ return foundat, result
+
+ # return list of start positions for the tagpath
+ def posinDoc(self, tagpath):
+ startpos = []
+ pos = 0
+ res = ""
+ while res != None :
+ (foundpos, res) = self.findinDoc(tagpath, pos, -1)
+ if res != None :
+ startpos.append(foundpos)
+ pos = foundpos + 1
+ return startpos
+
def getData(self, path):
result = None
cnt = len(self.flatdoc)
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
+
+ def getDataatPos(self, path, pos):
+ result = None
+ item = self.flatdoc[pos]
+ if item.find('=') >= 0:
+ (name, argt) = item.split('=')
+ argres = argt.split('|')
+ else:
+ name = item
+ argres = []
+ if (len(argres) > 0) :
+ for j in xrange(0,len(argres)):
+ argres[j] = int(argres[j])
+ if (name.endswith(path)):
+ result = argres
+ return result
+
def getDataTemp(self, path):
result = None
cnt = len(self.temp)
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
+
def getImages(self):
result = []
self.temp = self.flatdoc
src = self.getDataTemp('img.src')[0]
result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
return result
+
def getGlyphs(self):
result = []
if (self.gid != None) and (len(self.gid) > 0):
return result
-def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
+def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi):
ml = ''
- pp = PParser(gdict, flat_xml)
+ pp = PParser(gdict, flat_xml, meta_array)
ml += '<?xml version="1.0" standalone="no"?>\n'
if (raw):
ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
- ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
+ ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
else:
ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
- ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
+ ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
ml += '<script><![CDATA[\n'
ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
ml += 'var dpi=%d;\n' % scaledpi
- if (counter) :
- ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
- if (counter < numfiles-1) :
- ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
+ if (previd) :
+ ml += 'var prevpage="page%04d.xhtml";\n' % (previd)
+ if (nextid) :
+ ml += 'var nextpage="page%04d.xhtml";\n' % (nextid)
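+ # previd and nextid are original page ids (or None at either end of the
+ # book), no longer simple file counters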
ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
ml += '</head>\n'
ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
ml += '<div style="white-space:nowrap;">\n'
- if (counter == 0) :
+ if previd == None:
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
else:
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'
+
ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
if (pp.gid != None):
ml += '<defs>\n'
for j in xrange(0,len(pp.gid)):
ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
- ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n'
+ xpos = "%d" % (pp.pw // 3)
+ ypos = "%d" % (pp.ph // 3)
+ ml += '<text x="' + xpos + '" y="' + ypos + '" font-size="' + meta_array['fontSize'] + '" font-family="Helvetica" stroke="black">This page intentionally left blank.</text>\n'
if (raw) :
ml += '</svg>'
else :
ml += '</svg></a>\n'
- if (counter == numfiles - 1) :
+ if nextid == None:
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
else :
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'
from struct import pack
from struct import unpack
+class TpzDRMError(Exception):
+ pass
# local support routines
if 'calibre' in sys.modules:
return self.stable[self.pos]
else:
print "Error - %d outside of string table limits" % val
- sys.exit(-1)
+ raise TpzDRMError('outside of string table limits')
+ # sys.exit(-1)
def getSize(self):
return self.size
def getPos(self):
(ph, pw) = getPageDim(flat_xml)
if (ph == '-1') or (ph == '0') : ph = '11000'
if (pw == '-1') or (pw == '0') : pw = '8500'
-
- # print ' ', 'other0000.dat'
+ meta_array['pageHeight'] = ph
+ meta_array['pageWidth'] = pw
+ if 'fontSize' not in meta_array.keys():
+ meta_array['fontSize'] = fontsize
+
+ # process other.dat for css info and for map of page files to svg images
+ # this map is needed because some pages actually are made up of multiple
+ # pageXXXX.xml files
xname = os.path.join(bookDir, 'style.css')
flat_xml = convert2xml.fromData(dict, otherFile)
+
+ # extract info.original.pid to get original page information
+ pageIDMap = {}
+ pageidnums = stylexml2css.getpageIDMap(flat_xml)
+ if len(pageidnums) == 0:
+ filenames = os.listdir(pageDir)
+ numfiles = len(filenames)
+ for k in range(numfiles):
+ pageidnums.append(k)
+ # create a map from page ids to list of page file nums to process for that page
+ for i in range(len(pageidnums)):
+ id = pageidnums[i]
+ if id in pageIDMap.keys():
+ pageIDMap[id].append(i)
+ else:
+ pageIDMap[id] = [i]
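+ # e.g. pageidnums [3, 4, 4, 5] gives pageIDMap {3: [0], 4: [1, 2], 5: [3]},
+ # so page files 0001 and 0002 are later merged into one svg page 0004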
+
+ # now get the css info
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
file(xname, 'wb').write(cssstr)
xname = os.path.join(xmlDir, 'other0000.xml')
glyfile.close()
print " "
+ # build up tocentries while processing html
+ tocentries = ''
+
# start up the html
htmlFileName = "book.html"
htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
# readability when rendering to the screen.
scaledpi = 1440.0
- svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
- svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
- svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
- svgindex += '<head>\n'
- svgindex += '<title>' + meta_array['Title'] + '</title>\n'
- svgindex += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
- svgindex += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
- if 'ASIN' in meta_array:
- svgindex += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
- if 'GUID' in meta_array:
- svgindex += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
- svgindex += '</head>\n'
- svgindex += '<body>\n'
-
filenames = os.listdir(pageDir)
filenames = sorted(filenames)
numfiles = len(filenames)
- counter = 0
+
+ xmllst = []
for filename in filenames:
# print ' ', filename
print ".",
-
fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.fromData(dict, fname)
+ # keep flat_xml for later svg processing
+ xmllst.append(flat_xml)
+
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
# first get the html
- htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
+ pagehtml, tocinfo = flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
+ tocentries += tocinfo
+ htmlstr += pagehtml
- # now get the svg image of the page
- svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)
+ # finish up the html string and output it
+ htmlstr += '</body>\n</html>\n'
+ file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
+
+ print " "
+ print 'Extracting Table of Contents from Amazon OCR'
+
+ # first create a table of contents file for the svg images
+ tochtml = '<?xml version="1.0" encoding="utf-8"?>\n'
+ tochtml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
+ tochtml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
+ tochtml += '<head>\n'
+ tochtml += '<title>' + meta_array['Title'] + '</title>\n'
+ tochtml += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
+ tochtml += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
+ if 'ASIN' in meta_array:
+ tochtml += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
+ if 'GUID' in meta_array:
+ tochtml += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
+ tochtml += '</head>\n'
+ tochtml += '<body>\n'
+
+ tochtml += '<h2>Table of Contents</h2>\n'
+ start = pageidnums[0]
+ if (raw):
+ startname = 'page%04d.svg' % start
+ else:
+ startname = 'page%04d.xhtml' % start
+
+ tochtml += '<h3><a href="' + startname + '">Start of Book</a></h3>\n'
+ # build up a table of contents for the svg xhtml output
+ toclst = tocentries.split('\n')
+ toclst.pop()
+ for entry in toclst:
+ print entry
+ title, pagenum = entry.split('|')
+ id = pageidnums[int(pagenum)]
+ if (raw):
+ fname = 'page%04d.svg' % id
+ else:
+ fname = 'page%04d.xhtml' % id
+ tochtml += '<h3><a href="'+ fname + '">' + title + '</a></h3>\n'
+ tochtml += '</body>\n'
+ tochtml += '</html>\n'
+ file(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
- if (raw) :
- pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
- svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
- else :
- pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
- svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)
+ # now create index_svg.xhtml that points to all required files
+ svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
+ svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
+ svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
+ svgindex += '<head>\n'
+ svgindex += '<title>' + meta_array['Title'] + '</title>\n'
+ svgindex += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
+ svgindex += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
+ if 'ASIN' in meta_array:
+ svgindex += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
+ if 'GUID' in meta_array:
+ svgindex += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
+ svgindex += '</head>\n'
+ svgindex += '<body>\n'
+ print "Building svg images of each book page"
+ svgindex += '<h2>List of Pages</h2>\n'
+ svgindex += '<div>\n'
+ idlst = sorted(pageIDMap.keys())
+ numids = len(idlst)
+ cnt = len(idlst)
+ previd = None
+ for j in range(cnt):
+ pageid = idlst[j]
+ if j < cnt - 1:
+ nextid = idlst[j+1]
+ else:
+ nextid = None
+ print '.',
+ pagelst = pageIDMap[pageid]
+ flat_svg = ''
+ for page in pagelst:
+ flat_svg += xmllst[page]
+ svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
+ if (raw) :
+ pfile = open(os.path.join(svgDir,'page%04d.svg' % pageid),'w')
+ svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (pageid, pageid)
+ else :
+ pfile = open(os.path.join(svgDir,'page%04d.xhtml' % pageid), 'w')
+ svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (pageid, pageid)
+ previd = pageid
pfile.write(svgxml)
pfile.close()
-
- counter += 1
-
- print " "
-
- # finish up the html string and output it
- htmlstr += '</body>\n</html>\n'
- file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
-
- # finish up the svg index string and output it
+ svgindex += '</div>\n'
+ svgindex += '<h2><a href="svg/toc.xhtml">Table of Contents</a></h2>\n'
svgindex += '</body>\n</html>\n'
file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
+ print " "
+
# build the opf file
opfname = os.path.join(bookDir, 'book.opf')
opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
return 1
raw = 0
- fixedimage = False
+ fixedimage = True
for o, a in opts:
if o =="-h":
usage()
# and many many others
-__version__ = '3.7'
+__version__ = '3.9'
class Unbuffered:
def __init__(self, stream):
import os, csv, getopt
import string
import re
+import traceback
class DrmException(Exception):
pass
print "Processing Book: ", title
filenametitle = cleanup_name(title)
outfilename = bookname
- if len(bookname)>4 and len(filenametitle)>4 and bookname[:4] != filenametitle[:4]:
+ if len(outfilename)<=8 or len(filenametitle)<=8:
outfilename = outfilename + "_" + filenametitle
+ elif outfilename[:8] != filenametitle[:8]:
+ outfilename = outfilename[:8] + "_" + filenametitle
+
+ # avoid excessively long file names
+ if len(outfilename)>150:
+ outfilename = outfilename[:150]
# build pid list
md1, md2 = mb.getPIDMetaInfo()
zipname = os.path.join(outdir, outfilename + '_nodrm' + '.htmlz')
mb.getHTMLZip(zipname)
- print " Creating SVG HTMLZ Archive"
- zipname = os.path.join(outdir, outfilename + '_SVG' + '.htmlz')
+ print " Creating SVG ZIP Archive"
+ zipname = os.path.join(outdir, outfilename + '_SVG' + '.zip')
mb.getSVGZip(zipname)
print " Creating XML ZIP Archive"
pos = foundpos + 1
return startpos
+ # returns a vector of integers for the tagpath
+ def getData(self, tagpath, pos, end):
+ argres=[]
+ (foundat, argt) = self.findinDoc(tagpath, pos, end)
+ if (argt != None) and (len(argt) > 0) :
+ argList = argt.split('|')
+ argres = [ int(strval) for strval in argList]
+ return argres
def process(self):
# create a document parser
dp = DocParser(flatxml, fontsize, ph, pw)
-
csspage = dp.process()
-
return csspage
+
+
+def getpageIDMap(flatxml):
+ dp = DocParser(flatxml, 0, 0, 0)
+ pageidnumbers = dp.getData('info.original.pid', 0, -1)
+ return pageidnumbers
def __init__(self, filename):
self.fo = file(filename, 'rb')
self.outdir = tempfile.mkdtemp()
+ # self.outdir = 'rawdat'
self.bookPayloadOffset = 0
self.bookHeaderRecords = {}
self.bookMetadata = {}
def cleanup(self):
if os.path.isdir(self.outdir):
- shutil.rmtree(self.outdir, True)
+ pass
+ # shutil.rmtree(self.outdir, True)
def usage(progname):
print "Removes DRM protection from Topaz ebooks and extract the contents"
tb.getHTMLZip(zipname)
print " Creating SVG ZIP Archive"
- zipname = os.path.join(outdir, bookname + '_SVG' + '.htmlz')
+ zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
tb.getSVGZip(zipname)
print " Creating XML ZIP Archive"
except TpzDRMError, e:
print str(e)
- tb.cleanup()
+ # tb.cleanup()
return 1
except Exception, e:
print str(e)
- tb.cleanup
+ # tb.cleanup
return 1
return 0
from struct import pack
from struct import unpack
+class TpzDRMError(Exception):
+ pass
# Get a 7 bit encoded number from string. The most
# significant byte comes first and has the high bit (8th) set
return self.stable[self.pos]
else:
print "Error - %d outside of string table limits" % val
- sys.exit(-1)
+ raise TpzDRMError('outside of string table limits')
+ # sys.exit(-1)
def getSize(self):
return self.size
'paragraph.class' : (1, 'scalar_text', 0, 0),
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ 'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
+
'word_semantic' : (1, 'snippets', 1, 1),
'word_semantic.type' : (1, 'scalar_text', 0, 0),
'_span' : (1, 'snippets', 1, 0),
'_span.firstWord' : (1, 'scalar_number', 0, 0),
- '-span.lastWord' : (1, 'scalar_number', 0, 0),
+ '_span.lastWord' : (1, 'scalar_number', 0, 0),
+ '_span.gridSize' : (1, 'scalar_number', 0, 0),
+ '_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ '_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'span' : (1, 'snippets', 1, 0),
'span.firstWord' : (1, 'scalar_number', 0, 0),
'span.lastWord' : (1, 'scalar_number', 0, 0),
+ 'span.gridSize' : (1, 'scalar_number', 0, 0),
+ 'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
+ 'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'extratokens' : (1, 'snippets', 1, 0),
'extratokens.type' : (1, 'scalar_text', 0, 0),
pclass = self.getClass(pclass)
+ # if paragraph uses extratokens (extra glyphs) then make it fixed
+ (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
+
# build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
last = int(slast)
makeImage = (regtype == 'vertical') or (regtype == 'table')
+ makeImage = makeImage or (extraglyphs != None)
if self.fixedimage:
makeImage = makeImage or (regtype == 'fixed')
word_class = ''
+ word_semantic_type = ''
+
while (line < end) :
(name, argres) = self.lineinDoc(line)
return parares
+ def buildTOCEntry(self, pdesc) :
+ parares = ''
+ sep =''
+ tocentry = ''
+ handle_links = len(self.link_id) > 0
+
+ lstart = 0
+
+ cnt = len(pdesc)
+ for j in xrange( 0, cnt) :
+
+ (wtype, num) = pdesc[j]
+
+ if wtype == 'ocr' :
+ word = self.ocrtext[num]
+ sep = ' '
+
+ if handle_links:
+ link = self.link_id[num]
+ if (link > 0):
+ linktype = self.link_type[link-1]
+ title = self.link_title[link-1]
+ title = title.rstrip('. ')
+ alt_title = parares[lstart:]
+ alt_title = alt_title.strip()
+ # now strip off the actual printed page number
+ alt_title = alt_title.rstrip('01234567890ivxldIVXLD-.')
+ alt_title = alt_title.rstrip('. ')
+ # skip over any external links - can't have them in a books toc
+ if linktype == 'external' :
+ title = ''
+ alt_title = ''
+ linkpage = ''
+ else :
+ if len(self.link_page) >= link :
+ ptarget = self.link_page[link-1] - 1
+ linkpage = '%04d' % ptarget
+ else :
+ # just link to the current page
+ linkpage = self.id[4:]
+ if len(alt_title) >= len(title):
+ title = alt_title
+ if title != '' and linkpage != '':
+ tocentry += title + '|' + linkpage + '\n'
+ lstart = len(parares)
+ if word == '_link_' : word = ''
+ elif (link < 0) :
+ if word == '_link_' : word = ''
+
+ if word == '_lb_':
+ word = ''
+ sep = ''
+
+ if num in self.dehyphen_rootid :
+ word = word[0:-1]
+ sep = ''
+
+ parares += word + sep
+
+ else :
+ continue
+
+ return tocentry
+
+
+
# walk the document tree collecting the information needed
# to build an html page using the ocrText
def process(self):
htmlpage = ''
+ tocinfo = ''
# get the ocr text
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
+ tocinfo += self.buildTOCEntry(pdesc)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
-
elif (regtype == 'vertical') or (regtype == 'table') :
ptype = 'full'
if inGroup:
htmlpage = htmlpage[0:-4]
last_para_continued = False
- return htmlpage
-
+ return htmlpage, tocinfo
def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
# create a document parser
dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
- htmlpage = dp.process()
- return htmlpage
+ htmlpage, tocinfo = dp.process()
+ return htmlpage, tocinfo
class PParser(object):
- def __init__(self, gd, flatxml):
+ def __init__(self, gd, flatxml, meta_array):
self.gd = gd
self.flatdoc = flatxml.split('\n')
+ self.docSize = len(self.flatdoc)
self.temp = []
- foo = self.getData('page.h') or self.getData('book.h')
- self.ph = foo[0]
- foo = self.getData('page.w') or self.getData('book.w')
- self.pw = foo[0]
- self.gx = self.getData('info.glyph.x')
- self.gy = self.getData('info.glyph.y')
- self.gid = self.getData('info.glyph.glyphID')
+
+ self.ph = -1
+ self.pw = -1
+ startpos = self.posinDoc('page.h') or self.posinDoc('book.h')
+ for p in startpos:
+ (name, argres) = self.lineinDoc(p)
+ self.ph = max(self.ph, int(argres))
+ startpos = self.posinDoc('page.w') or self.posinDoc('book.w')
+ for p in startpos:
+ (name, argres) = self.lineinDoc(p)
+ self.pw = max(self.pw, int(argres))
+
+ if self.ph <= 0:
+ self.ph = int(meta_array.get('pageHeight', '11000'))
+ if self.pw <= 0:
+ self.pw = int(meta_array.get('pageWidth', '8500'))
+
+ res = []
+ startpos = self.posinDoc('info.glyph.x')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.x', p)
+ res.extend(argres)
+ self.gx = res
+
+ res = []
+ startpos = self.posinDoc('info.glyph.y')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.y', p)
+ res.extend(argres)
+ self.gy = res
+
+ res = []
+ startpos = self.posinDoc('info.glyph.glyphID')
+ for p in startpos:
+ argres = self.getDataatPos('info.glyph.glyphID', p)
+ res.extend(argres)
+ self.gid = res
+
+
+ # return tag at line pos in document
+ def lineinDoc(self, pos) :
+ if (pos >= 0) and (pos < self.docSize) :
+ item = self.flatdoc[pos]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=',1)
+ else :
+ name = item
+ argres = ''
+ return name, argres
+
+ # find the first tag matching tagpath between pos and end
+ def findinDoc(self, tagpath, pos, end) :
+ result = None
+ if end == -1 :
+ end = self.docSize
+ else:
+ end = min(self.docSize, end)
+ foundat = -1
+ for j in xrange(pos, end):
+ item = self.flatdoc[j]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=',1)
+ else :
+ name = item
+ argres = ''
+ if name.endswith(tagpath) :
+ result = argres
+ foundat = j
+ break
+ return foundat, result
+
+ # return list of start positions for the tagpath
+ def posinDoc(self, tagpath):
+ startpos = []
+ pos = 0
+ res = ""
+ while res != None :
+ (foundpos, res) = self.findinDoc(tagpath, pos, -1)
+ if res != None :
+ startpos.append(foundpos)
+ pos = foundpos + 1
+ return startpos
+
def getData(self, path):
result = None
cnt = len(self.flatdoc)
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
+
+ def getDataatPos(self, path, pos):
+ result = None
+ item = self.flatdoc[pos]
+ if item.find('=') >= 0:
+ (name, argt) = item.split('=')
+ argres = argt.split('|')
+ else:
+ name = item
+ argres = []
+ if (len(argres) > 0) :
+ for j in xrange(0,len(argres)):
+ argres[j] = int(argres[j])
+ if (name.endswith(path)):
+ result = argres
+ return result
+
def getDataTemp(self, path):
result = None
cnt = len(self.temp)
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
+
def getImages(self):
result = []
self.temp = self.flatdoc
src = self.getDataTemp('img.src')[0]
result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
return result
+
def getGlyphs(self):
result = []
if (self.gid != None) and (len(self.gid) > 0):
return result
-def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
+def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi):
ml = ''
- pp = PParser(gdict, flat_xml)
+ pp = PParser(gdict, flat_xml, meta_array)
ml += '<?xml version="1.0" standalone="no"?>\n'
if (raw):
ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
- ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
+ ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
else:
ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
- ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
+ ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
ml += '<script><![CDATA[\n'
ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
ml += 'var dpi=%d;\n' % scaledpi
- if (counter) :
- ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
- if (counter < numfiles-1) :
- ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
+ if (previd) :
+ ml += 'var prevpage="page%04d.xhtml";\n' % (previd)
+ if (nextid) :
+ ml += 'var nextpage="page%04d.xhtml";\n' % (nextid)
ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
ml += '</head>\n'
ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
ml += '<div style="white-space:nowrap;">\n'
- if (counter == 0) :
+ if previd == None:
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
else:
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'
+
ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
if (pp.gid != None):
ml += '<defs>\n'
for j in xrange(0,len(pp.gid)):
ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
- ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n'
+ xpos = "%d" % (pp.pw // 3)
+ ypos = "%d" % (pp.ph // 3)
+ ml += '<text x="' + xpos + '" y="' + ypos + '" font-size="' + meta_array['fontSize'] + '" font-family="Helvetica" stroke="black">This page intentionally left blank.</text>\n'
if (raw) :
ml += '</svg>'
else :
ml += '</svg></a>\n'
- if (counter == numfiles - 1) :
+ if nextid == None:
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
else :
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'
from struct import pack
from struct import unpack
+class TpzDRMError(Exception):
+ pass
# local support routines
if 'calibre' in sys.modules:
return self.stable[self.pos]
else:
print "Error - %d outside of string table limits" % val
- sys.exit(-1)
+ raise TpzDRMError('outside of string table limits')
+ # sys.exit(-1)
def getSize(self):
return self.size
def getPos(self):
(ph, pw) = getPageDim(flat_xml)
if (ph == '-1') or (ph == '0') : ph = '11000'
if (pw == '-1') or (pw == '0') : pw = '8500'
-
- # print ' ', 'other0000.dat'
+ meta_array['pageHeight'] = ph
+ meta_array['pageWidth'] = pw
+ if 'fontSize' not in meta_array.keys():
+ meta_array['fontSize'] = fontsize
+
+ # process other.dat for css info and for map of page files to svg images
+ # this map is needed because some pages actually are made up of multiple
+ # pageXXXX.xml files
xname = os.path.join(bookDir, 'style.css')
flat_xml = convert2xml.fromData(dict, otherFile)
+
+ # extract info.original.pid to get original page information
+ pageIDMap = {}
+ pageidnums = stylexml2css.getpageIDMap(flat_xml)
+ if len(pageidnums) == 0:
+ filenames = os.listdir(pageDir)
+ numfiles = len(filenames)
+ for k in range(numfiles):
+ pageidnums.append(k)
+ # create a map from page ids to list of page file nums to process for that page
+ for i in range(len(pageidnums)):
+ id = pageidnums[i]
+ if id in pageIDMap.keys():
+ pageIDMap[id].append(i)
+ else:
+ pageIDMap[id] = [i]
+
+ # now get the css info
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
file(xname, 'wb').write(cssstr)
xname = os.path.join(xmlDir, 'other0000.xml')
glyfile.close()
print " "
+ # build up tocentries while processing html
+ tocentries = ''
+
# start up the html
htmlFileName = "book.html"
htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
# readability when rendering to the screen.
scaledpi = 1440.0
- svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
- svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
- svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
- svgindex += '<head>\n'
- svgindex += '<title>' + meta_array['Title'] + '</title>\n'
- svgindex += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
- svgindex += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
- if 'ASIN' in meta_array:
- svgindex += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
- if 'GUID' in meta_array:
- svgindex += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
- svgindex += '</head>\n'
- svgindex += '<body>\n'
-
filenames = os.listdir(pageDir)
filenames = sorted(filenames)
numfiles = len(filenames)
- counter = 0
+
+ xmllst = []
for filename in filenames:
# print ' ', filename
print ".",
-
fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.fromData(dict, fname)
+ # keep flat_xml for later svg processing
+ xmllst.append(flat_xml)
+
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
# first get the html
- htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
+ pagehtml, tocinfo = flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
+ tocentries += tocinfo
+ htmlstr += pagehtml
- # now get the svg image of the page
- svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)
+ # finish up the html string and output it
+ htmlstr += '</body>\n</html>\n'
+ file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
+
+ print " "
+ print 'Extracting Table of Contents from Amazon OCR'
+
+ # first create a table of contents file for the svg images
+ tochtml = '<?xml version="1.0" encoding="utf-8"?>\n'
+ tochtml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
+ tochtml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
+ tochtml += '<head>\n'
+ tochtml += '<title>' + meta_array['Title'] + '</title>\n'
+ tochtml += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
+ tochtml += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
+ if 'ASIN' in meta_array:
+ tochtml += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
+ if 'GUID' in meta_array:
+ tochtml += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
+ tochtml += '</head>\n'
+ tochtml += '<body>\n'
+
+ tochtml += '<h2>Table of Contents</h2>\n'
+ start = pageidnums[0]
+ if (raw):
+ startname = 'page%04d.svg' % start
+ else:
+ startname = 'page%04d.xhtml' % start
+
+ tochtml += '<h3><a href="' + startname + '">Start of Book</a></h3>\n'
+ # build up a table of contents for the svg xhtml output
+ toclst = tocentries.split('\n')
+ toclst.pop()
+ for entry in toclst:
+ print entry
+ title, pagenum = entry.split('|')
+ id = pageidnums[int(pagenum)]
+ if (raw):
+ fname = 'page%04d.svg' % id
+ else:
+ fname = 'page%04d.xhtml' % id
+ tochtml += '<h3><a href="'+ fname + '">' + title + '</a></h3>\n'
+ tochtml += '</body>\n'
+ tochtml += '</html>\n'
+ file(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
- if (raw) :
- pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
- svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
- else :
- pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
- svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)
+ # now create index_svg.xhtml that points to all required files
+ svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
+ svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
+ svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
+ svgindex += '<head>\n'
+ svgindex += '<title>' + meta_array['Title'] + '</title>\n'
+ svgindex += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
+ svgindex += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
+ if 'ASIN' in meta_array:
+ svgindex += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
+ if 'GUID' in meta_array:
+ svgindex += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
+ svgindex += '</head>\n'
+ svgindex += '<body>\n'
+ print "Building svg images of each book page"
+ svgindex += '<h2>List of Pages</h2>\n'
+ svgindex += '<div>\n'
+ idlst = sorted(pageIDMap.keys())
+ numids = len(idlst)
+ cnt = len(idlst)
+ previd = None
+ for j in range(cnt):
+ pageid = idlst[j]
+ if j < cnt - 1:
+ nextid = idlst[j+1]
+ else:
+ nextid = None
+ print '.',
+ pagelst = pageIDMap[pageid]
+ flat_svg = ''
+ for page in pagelst:
+ flat_svg += xmllst[page]
+ svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
+ if (raw) :
+ pfile = open(os.path.join(svgDir,'page%04d.svg' % pageid),'w')
+ svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (pageid, pageid)
+ else :
+ pfile = open(os.path.join(svgDir,'page%04d.xhtml' % pageid), 'w')
+ svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (pageid, pageid)
+ previd = pageid
pfile.write(svgxml)
pfile.close()
-
- counter += 1
-
- print " "
-
- # finish up the html string and output it
- htmlstr += '</body>\n</html>\n'
- file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
-
- # finish up the svg index string and output it
+ svgindex += '</div>\n'
+ svgindex += '<h2><a href="svg/toc.xhtml">Table of Contents</a></h2>\n'
svgindex += '</body>\n</html>\n'
file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
+ print " "
+
# build the opf file
opfname = os.path.join(bookDir, 'book.opf')
opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
return 1
raw = 0
- fixedimage = False
+ fixedimage = True
for o, a in opts:
if o =="-h":
usage()
# and many many others
-__version__ = '3.7'
+__version__ = '3.9'
class Unbuffered:
def __init__(self, stream):
import os, csv, getopt
import string
import re
+import traceback
class DrmException(Exception):
pass
print "Processing Book: ", title
filenametitle = cleanup_name(title)
outfilename = bookname
- if len(bookname)>4 and len(filenametitle)>4 and bookname[:4] != filenametitle[:4]:
+ if len(outfilename)<=8 or len(filenametitle)<=8:
outfilename = outfilename + "_" + filenametitle
+ elif outfilename[:8] != filenametitle[:8]:
+ outfilename = outfilename[:8] + "_" + filenametitle
+
+ # avoid excessively long file names
+ if len(outfilename)>150:
+ outfilename = outfilename[:150]
# build pid list
md1, md2 = mb.getPIDMetaInfo()
zipname = os.path.join(outdir, outfilename + '_nodrm' + '.htmlz')
mb.getHTMLZip(zipname)
- print " Creating SVG HTMLZ Archive"
- zipname = os.path.join(outdir, outfilename + '_SVG' + '.htmlz')
+ print " Creating SVG ZIP Archive"
+ zipname = os.path.join(outdir, outfilename + '_SVG' + '.zip')
mb.getSVGZip(zipname)
print " Creating XML ZIP Archive"
pos = foundpos + 1
return startpos
+ # returns a vector of integers for the tagpath
+ def getData(self, tagpath, pos, end):
+ argres=[]
+ (foundat, argt) = self.findinDoc(tagpath, pos, end)
+ if (argt != None) and (len(argt) > 0) :
+ argList = argt.split('|')
+ argres = [ int(strval) for strval in argList]
+ return argres
def process(self):
# create a document parser
dp = DocParser(flatxml, fontsize, ph, pw)
-
csspage = dp.process()
-
return csspage
+
+
+def getpageIDMap(flatxml):
+ dp = DocParser(flatxml, 0, 0, 0)
+ pageidnumbers = dp.getData('info.original.pid', 0, -1)
+ return pageidnumbers
def __init__(self, filename):
self.fo = file(filename, 'rb')
self.outdir = tempfile.mkdtemp()
+ # self.outdir = 'rawdat'
self.bookPayloadOffset = 0
self.bookHeaderRecords = {}
self.bookMetadata = {}
def cleanup(self):
if os.path.isdir(self.outdir):
- shutil.rmtree(self.outdir, True)
+ pass
+ # shutil.rmtree(self.outdir, True)
def usage(progname):
print "Removes DRM protection from Topaz ebooks and extract the contents"
tb.getHTMLZip(zipname)
print " Creating SVG ZIP Archive"
- zipname = os.path.join(outdir, bookname + '_SVG' + '.htmlz')
+ zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
tb.getSVGZip(zipname)
print " Creating XML ZIP Archive"
except TpzDRMError, e:
print str(e)
- tb.cleanup()
+ # tb.cleanup()
return 1
except Exception, e:
print str(e)
- tb.cleanup
+ # tb.cleanup
return 1
return 0
Linux Users
-----------
-Unfortuantely, the Calibre Plugins do not really work well on Linux because of issues running Calibre under Wine. Native versions of Calibre can not be used with the K4MobiDeDRM plugin because the plugin will not be able to find the information it needs to remove the DRM.
-
-Although some of the scripts do work on native Linux, others require the use of a recent version of Wine.
-
-Here are the instructions for using KindleBooks.pyw on Linux under Wine.
-
-1. upgrade to very recent versions of Wine; This has been tested with Wine 1.3.18 – 1.3.22. It may work with earlier versions but no promises.
-
-2. Some versions of winecfg have a bug in setting the volume serial number, so create a .windows-serial file at root of drive_c to set a proper windows volume serial number (8 digit hex value for unsigned integer).
-cd ~
-cd .wine
-cd drive_c
-echo deadbeaf > .windows-serial
-
-Replace deadbeef with whatever you want but I would stay away from the default setting of ffffffff
-
-3. Only ***after*** setting the volume serial number properly – download and install under wine K4PC version for Windows. Register it and download from your Archive one of your Kindle ebooks. Versions known to work are K4PC 1.4.1 and earlier. Later version may work but no promises.
-
-4. Download and install under wine ActiveState Active Python 2.7 for Windows 32bit
-
-5. Download and unzip tools_v4.5.zip
-
-6. Then run KindleBook.pyw ***under python running on wine*** using one of the following methods:
-
-From a Linux shell:
-
- wine python KindleBooks.pyw
-
-Or to get a Windows (wine) command prompt
-
- wine cmd
- python KindleBooks.pyw
-
-Or to get a "Windows" file explorer:
-
- winefile
-
- and then double-click on any .pyw files to run them in the wine environment
-
+Please see ReadMe_Linux_Users.txt.
--- /dev/null
+ReadMe for Linux Users and the Tools
+
+
+Linux and Kindle for PC (KindleBooks.pyw)
+------------------------------------------
+
+Here are the instructions for using Kindle for PC and KindleBooks.pyw on Linux under Wine. (Thank you Eyeless and Pete)
+
+1. Upgrade to a very recent version of Wine; this has been tested with Wine 1.3.15 – 1.3.2X. It may work with earlier versions, but no promises. It does not work with Wine 1.2.X versions.
+
+If you have not already installed Kindle for PC under Wine, follow steps 2 and 3; otherwise jump to step 4.
+
+2. Some versions of winecfg have a bug in setting the volume serial number, so create a .windows-serial file at the root of drive_c to set a proper Windows volume serial number (an 8-digit hex value for an unsigned integer).
+cd ~
+cd .wine
+cd drive_c
+echo deadbeef > .windows-serial
+
+Replace "deadbeef" with whatever hex value you want but I would stay away from the default setting of "ffffffff" which does not seem to work. BTW: deadbeef is itself a valid possible hex value if you want to use it
+
+3. Only ***after*** setting the volume serial number properly, download and install the Windows version of Kindle for PC (K4PC) under Wine. Register it and download one of your Kindle ebooks from your Archive. Versions known to work are K4PC 1.7.1 and earlier. Later versions may work, but no promises.
+
+4. Download and install ActiveState ActivePython 2.7 for Windows (32-bit) under Wine.
+
+5. Download and unzip tools_v4.X.zip
+
+6. Now make sure the executable bit is NOT set on KindleBooks.pyw; otherwise Linux will try to bypass Wine and launch it under the native Linux Python, which will cause it to fail.
+
+cd tools_v4.7/KindleBooks/
+chmod ugo-x KindleBooks.pyw
+
+7. Then run KindleBooks.pyw ***under Python running on Wine*** from the Linux shell as follows:
+
+wine python KindleBooks.pyw
+
+Select the ebook file directly from your “My Kindle Content” folder and select a new/unused directory for the output. You should not need to enter any PID or serial number for Kindle for PC.
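+
+(For reference: under Wine, the “My Kindle Content” folder normally ends up somewhere like ~/.wine/drive_c/users/yourusername/My Documents/My Kindle Content/; the exact path depends on your Wine setup and username.)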
+
+
+
+
+
+
+Linux and Adobe Digital Editions ePubs
+--------------------------------------
+
+Here are the instructions for using the tools with ePub books and Adobe Digital Editions on Linux under Wine. (Thank you mclien!)
+
+
+1. download the most recent version of wine from winehq.org (1.3.29 in my case)
+
+For Debian users:
+
+To get a recent version of Wine, I decided to use aptosid (2011-02, xfce)
+(because I’m used to Debian).
+Install aptosid and upgrade it (see the aptosid site for detailed instructions).
+
+
+2. properly install Wine (see the Wine site for details)
+
+For Debian users:
+
+cd to this dir and install the packages as root:
+'dpkg -i *.deb'
+You will get some error messages, which can be ignored.
+Again as root, use
+'apt-get -f install' to correct these errors.
+
+3. Python 2.7 should already be installed on your system, but you may need the following additional Python package:
+
+'apt-get install python-tk'
+
+4. All programs need to be installed as a normal user. All of these programs are installed the same way:
+'wine <installer file>'
+We need:
+a) Adobe Digital Editions 1.7.2 (from: http://kb2.adobe.com/cps/403/kb403051.html)
+(there is a “can’t install ADE” page, where the setup.exe hides)
+b) ActivePython-2.7.2.5-win32-x86.msi (from: http://www.activestate.com/activepython/downloads)
+c) Win32OpenSSL_Light-0_9_8r.exe (from: http://www.slproweb.com/)
+d) pycrypto-2.3.win32-py2.7.msi (from: http://www.voidspace.org.uk/python/modules.shtml)
+
+5. Now get and unpack the very latest tools_v4.X (from Apprentice Alf) into the user's Wine drive_c
+(~/.wine/drive_c/)
+
+6. Start ADE with:
+'wine digitaleditions.exe' or from the start menu: wine-adobe-digital..
+
+7. Register this instance of ADE with your AdobeID and close it.
+Change to the Adobe_ePub_Tools dir inside tools_v4.X:
+cd ~/.wine/drive_c/tools_v4.X/Adobe_ePub_Tools
+
+8. Create the adeptkey.der with:
+'wine python ineptkey_v5.4.pyw' (only needed once!)
+(the key will be here: ~/.wine/drive_c/tools_v4.X/Adobe_ePub_Tools/adeptkey.der)
+
+9. Use ADE running under Wine to download all of your purchased ePub ebooks.
+
+10. For each book you have downloaded via Adobe Digital Editions
+(there is no need to use Wine for this step!):
+
+'python ineptepub_v5.6.pyw'
+This will launch a window with 3 lines:
+1. key: (already filled in; otherwise it’s in the path where you did step 8)
+2. input file: drmbook.epub
+3. output file: name-you-want_for_free_book.epub
+
+Also… once you have successfully generated your adeptkey.der keyfile using Wine, you can use the regular ineptepub plugin with the standard Linux calibre. Just put the *.der file(s) in your calibre configuration directory.
+So, if you want, you can use calibre on Linux:
+
+11. Install the plugins from the tools as described in the Windows ReadMes.
+
+12. Copy the adeptkey.der into the config dir of calibre (~/.config/calibre on Debian). Every book imported into calibre will automatically be freed from DRM.
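+
+For example, assuming the key was created in step 8 and calibre uses its default config dir, the copy is just:
+
+cp ~/.wine/drive_c/tools_v4.X/Adobe_ePub_Tools/adeptkey.der ~/.config/calibre/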
+
+
print " Decoding File"
sect = erdr2pml.Sectionizer(infile, 'PNRdPPrs')
- er = erdr2pml.EreaderProcessor(sect.loadSection, name, cc)
+ er = erdr2pml.EreaderProcessor(sect, name, cc)
if er.getNumImages() > 0:
print " Extracting images"
# 0.18 - on Windows try PyCrypto first and OpenSSL next
# 0.19 - Modify the interface to allow use of import
# 0.20 - modify to allow use inside new interface for calibre plugins
+# 0.21 - Support eReader (drm) version 11.
+# - Don't reject dictionary format.
+# - Ignore sidebars for dictionaries (different format?)
-__version__='0.20'
+__version__='0.21'
class Unbuffered:
def __init__(self, stream):
class Sectionizer(object):
+ bkType = "Book"
+
def __init__(self, filename, ident):
self.contents = file(filename, 'rb').read()
self.header = self.contents[0:72]
self.num_sections, = struct.unpack('>H', self.contents[76:78])
+ # Dictionary or normal content? (TODO: should not be hard-coded)
if self.header[0x3C:0x3C+8] != ident:
- raise ValueError('Invalid file format')
+ if self.header[0x3C:0x3C+8] == "PDctPPrs":
+ self.bkType = "Dict"
+ else:
+ raise ValueError('Invalid file format')
self.sections = []
for i in xrange(self.num_sections):
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
return r
class EreaderProcessor(object):
- def __init__(self, section_reader, username, creditcard):
- self.section_reader = section_reader
- data = section_reader(0)
+ def __init__(self, sect, username, creditcard):
+ self.section_reader = sect.loadSection
+ data = self.section_reader(0)
version, = struct.unpack('>H', data[0:2])
self.version = version
logging.info('eReader file format version %s', version)
if version != 272 and version != 260 and version != 259:
raise ValueError('incorrect eReader version %d (error 1)' % version)
- data = section_reader(1)
+ data = self.section_reader(1)
self.data = data
des = Des(fixKey(data[0:8]))
cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
+ # Default values
+ self.num_footnote_pages = 0
+ self.num_sidebar_pages = 0
+ self.first_footnote_page = -1
+ self.first_sidebar_page = -1
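+ # (overridden just below for version 272 files, which carry
+ # footnote and sidebar tables)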
if self.version == 272:
self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
- self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
- self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
+ if (sect.bkType == "Book"):
+ self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
+ self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
# self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
# self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
# self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
else:
- self.num_footnote_pages = 0
- self.num_sidebar_pages = 0
- self.first_footnote_page = -1
- self.first_sidebar_page = -1
+ # the defaults assigned above already apply
+ pass
# self.num_bookinfo_pages = 0
# self.num_chapter_pages = 0
# self.num_link_pages = 0
encrypted_key_sha = r[44:44+20]
encrypted_key = r[64:64+8]
elif version == 260:
- if drm_sub_version != 13:
+ if drm_sub_version != 13 and drm_sub_version != 11:
raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
- encrypted_key = r[44:44+8]
- encrypted_key_sha = r[52:52+20]
+ if drm_sub_version == 13:
+ encrypted_key = r[44:44+8]
+ encrypted_key_sha = r[52:52+20]
+ else:
+ encrypted_key = r[64:64+8]
+ encrypted_key_sha = r[44:44+20]
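+ # (sub-version 11 keeps the key and sha at the same offsets as
+ # the version 259 branch above)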
elif version == 272:
encrypted_key = r[172:172+8]
encrypted_key_sha = r[56:56+20]
r += fmarker
fnote_ids = fnote_ids[id_len+4:]
+ # TODO: handle dictionary index (?) pages, which are also marked as
+ # sidebar_pages (?). For now, dictionary sidebars are ignored.
+ # For dictionaries, record 0 is null-terminated strings, followed by
+ # blocks of around 62000 bytes and a final block. The encoding is
+ # not yet understood.
+
# now handle sidebar pages
if self.num_sidebar_pages > 0:
r += '\n'
id_len = ord(sbar_ids[2])
id = sbar_ids[3:3+id_len]
smarker = '<sidebar id="%s">\n' % id
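+ # sidebar records are DES-decrypted and zlib-inflated, the same
+ # scheme as the footnote records above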
- smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
+ smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
smarker += '\n</sidebar>\n'
r += smarker
sbar_ids = sbar_ids[id_len+4:]
bookname = os.path.splitext(os.path.basename(infile))[0]
print " Decoding File"
sect = Sectionizer(infile, 'PNRdPPrs')
- er = EreaderProcessor(sect.loadSection, name, cc)
+ er = EreaderProcessor(sect, name, cc)
if er.getNumImages() > 0:
print " Extracting images"