# tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
token_tags = {
- 'book' : (1, 'snippets', 1, 0),
- 'version' : (1, 'snippets', 1, 0),
- 'stylesheet' : (1, 'snippets', 1, 0),
- 'links' : (0, 'number', 0, 1),
- 'pages' : (0, 'number', 0, 1),
- 'page' : (1, 'snippets', 1, 0),
- 'group' : (1, 'snippets', 1, 0),
- 'region' : (1, 'snippets', 1, 0),
- 'reflow' : (1, 'number', 1, 0),
- 'img' : (1, 'snippets', 1, 0),
- 'paragraph' : (1, 'snippets', 1, 0),
- 'extratokens' : (1, 'snippets', 1, 0),
- 'style' : (1, 'snippets', 1, 0),
- 'rule' : (1, 'snippets', 1, 0),
- '_span' : (1, 'snippets', 1, 0),
- 'word_semantic': (1, 'snippets', 1, 1),
- 'value' : (1, 'scalar_text', 0, 0),
+ 'x' : (1, 'scalar_number', 0, 0),
+ 'y' : (1, 'scalar_number', 0, 0),
'h' : (1, 'scalar_number', 0, 0),
'w' : (1, 'scalar_number', 0, 0),
'firstWord' : (1, 'scalar_number', 0, 0),
'lastWord' : (1, 'scalar_number', 0, 0),
- 'x' : (1, 'number', 0, 0),
- 'y' : (1, 'number', 0, 0),
+ 'rootID' : (1, 'scalar_number', 0, 0),
+ 'stemID' : (1, 'scalar_number', 0, 0),
+ 'type' : (1, 'scalar_text', 0, 0),
+
+ 'info' : (0, 'number', 1, 0),
+
+ 'info.word' : (0, 'number', 1, 1),
+ 'info.word.ocrText' : (1, 'text', 0, 0),
+ 'info.word.firstGlyph' : (1, 'raw', 0, 0),
+ 'info.word.lastGlyph' : (1, 'raw', 0, 0),
+ 'info.word.bl' : (1, 'raw', 0, 0),
+ 'info.word.link_id' : (1, 'number', 0, 0),
+
+ 'glyph' : (0, 'number', 1, 1),
+ 'glyph.x' : (1, 'number', 0, 0),
+ 'glyph.y' : (1, 'number', 0, 0),
+ 'glyph.glyphID' : (1, 'number', 0, 0),
+
+ 'dehyphen' : (0, 'number', 1, 1),
+ 'dehyphen.rootID' : (1, 'number', 0, 0),
+ 'dehyphen.stemID' : (1, 'number', 0, 0),
+ 'dehyphen.stemPage' : (1, 'number', 0, 0),
+ 'dehyphen.sh' : (1, 'number', 0, 0),
+
+ 'links' : (0, 'number', 1, 1),
'links.page' : (1, 'number', 0, 0),
- 'link_id' : (1, 'number', 0, 0),
- 'glyph' : (0, 'number', 1, 1),
+ 'links.rel' : (1, 'number', 0, 0),
+ 'links.row' : (1, 'number', 0, 0),
+ 'links.title' : (1, 'text', 0, 0),
+ 'links.href' : (1, 'text', 0, 0),
+ 'links.type' : (1, 'text', 0, 0),
+
+ 'paraCont' : (0, 'number', 1, 1),
+ 'paraCont.rootID' : (1, 'number', 0, 0),
+ 'paraCont.stemID' : (1, 'number', 0, 0),
+ 'paraCont.stemPage' : (1, 'number', 0, 0),
+
+ 'paraStems' : (0, 'number', 1, 1),
+ 'paraStems.stemID' : (1, 'number', 0, 0),
+
+ 'wordStems' : (0, 'number', 1, 1),
+ 'wordStems.stemID' : (1, 'number', 0, 0),
+
+ 'page' : (1, 'snippets', 1, 0),
+ 'page.pageid' : (1, 'scalar_text', 0, 0),
+ 'page.pagelabel' : (1, 'scalar_text', 0, 0),
+ 'page.type' : (1, 'scalar_text', 0, 0),
+ 'page.h' : (1, 'scalar_number', 0, 0),
+ 'page.w' : (1, 'scalar_number', 0, 0),
+ 'page.startID' : (1, 'scalar_number', 0, 0),
+
+ 'group' : (1, 'snippets', 1, 0),
+ 'group.type' : (1, 'scalar_text', 0, 0),
+
+ 'region' : (1, 'snippets', 1, 0),
+ 'region.type' : (1, 'scalar_text', 0, 0),
+ 'region.x' : (1, 'scalar_number', 0, 0),
+ 'region.y' : (1, 'scalar_number', 0, 0),
+ 'region.h' : (1, 'scalar_number', 0, 0),
+ 'region.w' : (1, 'scalar_number', 0, 0),
+
+ 'img' : (1, 'snippets', 1, 0),
+ 'img.x' : (1, 'scalar_number', 0, 0),
+ 'img.y' : (1, 'scalar_number', 0, 0),
+ 'img.h' : (1, 'scalar_number', 0, 0),
+ 'img.w' : (1, 'scalar_number', 0, 0),
+ 'img.src' : (1, 'scalar_number', 0, 0),
+
+ 'paragraph' : (1, 'snippets', 1, 0),
+ 'paragraph.class' : (1, 'scalar_text', 0, 0),
+ 'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
+ 'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
+
+ 'word_semantic' : (1, 'snippets', 1, 1),
+ 'word_semantic.type' : (1, 'scalar_text', 0, 0),
+ 'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
+ 'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
+
+ 'word' : (1, 'snippets', 1, 0),
+ 'word.type' : (1, 'scalar_text', 0, 0),
+ 'word.class' : (1, 'scalar_text', 0, 0),
+
+ '_span' : (1, 'snippets', 1, 0),
+ '_span.firstWord' : (1, 'scalar_number', 0, 0),
+ '_span.lastWord' : (1, 'scalar_number', 0, 0),
+
+ 'extratokens' : (1, 'snippets', 1, 0),
+ 'extratokens.type' : (1, 'scalar_text', 0, 0),
+ 'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
+ 'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
+
'glyph.h' : (1, 'number', 0, 0),
'glyph.w' : (1, 'number', 0, 0),
- 'sh' : (1, 'number', 0, 0),
- 'word' : (0, 'number', 1, 1),
- 'src' : (1, 'scalar_number', 0, 0),
- 'rel' : (1, 'number', 0, 0),
- 'row' : (1, 'number', 0, 0),
- 'startID' : (1, 'number', 0, 1),
+ 'glyph.use' : (1, 'number', 0, 0),
+ 'glyph.vtx' : (1, 'number', 0, 1),
+ 'glyph.len' : (1, 'number', 0, 1),
+ 'glyph.dpi' : (1, 'number', 0, 0),
+ 'vtx' : (0, 'number', 1, 1),
+ 'vtx.x' : (1, 'number', 0, 0),
+ 'vtx.y' : (1, 'number', 0, 0),
+ 'len' : (0, 'number', 1, 1),
+ 'len.n' : (1, 'number', 0, 0),
+
+ 'book' : (1, 'snippets', 1, 0),
+ 'version' : (1, 'snippets', 1, 0),
+ 'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
+ 'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
+ 'version.Schema_id' : (1, 'scalar_text', 0, 0),
+ 'version.Schema_version' : (1, 'scalar_text', 0, 0),
+ 'version.Topaz_version' : (1, 'scalar_text', 0, 0),
+ 'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
+ 'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
+ 'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
+ 'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
+ 'version.chapterheaders' : (1, 'scalar_text', 0, 0),
+ 'version.creation_date' : (1, 'scalar_text', 0, 0),
+ 'version.header_footer' : (1, 'scalar_text', 0, 0),
+ 'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
+ 'version.letter_insertion' : (1, 'scalar_text', 0, 0),
+ 'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
+ 'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
+ 'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
+ 'version.findlists' : (1, 'scalar_text', 0, 0),
+ 'version.page_num' : (1, 'scalar_text', 0, 0),
+ 'version.page_type' : (1, 'scalar_text', 0, 0),
+
+ 'stylesheet' : (1, 'snippets', 1, 0),
+ 'style' : (1, 'snippets', 1, 0),
+ 'style._tag' : (1, 'scalar_text', 0, 0),
+ 'style.type' : (1, 'scalar_text', 0, 0),
+ 'style._parent_type' : (1, 'scalar_text', 0, 0),
+ 'style.class' : (1, 'scalar_text', 0, 0),
+ 'style._after_class' : (1, 'scalar_text', 0, 0),
+ 'rule' : (1, 'snippets', 1, 0),
+ 'rule.attr' : (1, 'scalar_text', 0, 0),
+ 'rule.value' : (1, 'scalar_text', 0, 0),
+
+ 'original' : (0, 'number', 1, 1),
+ 'original.pnum' : (1, 'number', 0, 0),
+ 'original.pid' : (1, 'text', 0, 0),
+ 'pages' : (0, 'number', 1, 1),
+ 'pages.ref' : (1, 'number', 0, 0),
+ 'pages.id' : (1, 'number', 0, 0),
+ 'startID' : (0, 'number', 1, 1),
'startID.page' : (1, 'number', 0, 0),
- 'glyphID' : (1, 'number', 0, 0),
- 'rootID' : (1, 'number', 0, 0),
- 'stemID' : (1, 'number', 0, 0),
- 'margin-top' : (1, 'number', 0, 0),
- 'stemPage' : (1, 'number', 0, 0),
- 'dehyphen' : (1, 'number', 1, 1),
- 'rootID' : (1, 'number', 0, 0),
- 'paraCont' : (1, 'number', 1, 1),
- 'paraStems' : (1, 'number', 1, 1),
- 'wordStems' : (1, 'number', 1, 1),
- 'original' : (0, 'number', 0, 1),
- 'use' : (1, 'number', 0, 0),
- 'vtx' : (1, 'number', 0, 1),
- 'len' : (1, 'number', 0, 1),
- 'dpi' : (1, 'number', 0, 0),
- 'n' : (1, 'number', 0, 0),
- 'id' : (1, 'number', 0, 0),
- 'ref' : (1, 'number', 0, 0),
- 'pnum' : (1, 'number', 0, 0),
- 'pid' : (1, 'text', 0, 0),
- 'info' : (0, 'number', 1, 0),
- 'bl' : (1, 'raw', 0, 0),
- 'firstGlyph' : (1, 'raw', 0, 0),
- 'lastGlyph' : (1, 'raw', 0, 0),
- 'ocrText' : (1, 'text', 0, 0),
- 'title' : (1, 'text', 0, 0),
- 'href' : (1, 'text', 0, 0),
- '_parent_type' : (1, 'text', 0, 0),
- 'attr' : (1, 'scalar_text', 0, 0),
- 'justify' : (1, 'scalar_text', 0, 0),
- 'align' : (1, 'scalar_text', 0, 0),
- 'layout' : (1, 'scalar_text', 0, 0),
- 'pageid' : (1, 'scalar_text', 0, 0),
- 'pagelabel' : (1, 'scalar_text', 0, 0),
- 'type' : (1, 'text', 0, 0),
- 'class' : (1, 'scalar_text', 0, 0),
- 'container' : (1, 'scalar_text', 0, 0),
- '_after_class' : (1, 'scalar_text', 0, 0),
- '_tag' : (1, 'scalar_text', 0, 0),
- 'pos' : (1, 'scalar_text', 0, 0),
- 'page_num' : (1, 'scalar_text', 0, 0),
- 'page_type' : (1, 'scalar_text', 0, 0),
- 'findlists' : (1, 'scalar_text', 0, 0),
- 'FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
- 'FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
- 'Schema_id' : (1, 'scalar_text', 0, 0),
- 'Schema_version' : (1, 'scalar_text', 0, 0),
- 'Topaz_version' : (1, 'scalar_text', 0, 0),
- 'WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
- 'WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
- 'ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
- 'ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
- 'chapterheaders' : (1, 'scalar_text', 0, 0),
- 'creation_date' : (1, 'scalar_text', 0, 0),
- 'header_footer' : (1, 'scalar_text', 0, 0),
- 'init_from_ocr' : (1, 'scalar_text', 0, 0),
- 'letter_insertion' : (1, 'scalar_text', 0, 0),
- 'xmlinj_convert' : (1, 'scalar_text', 0, 0),
- 'xmlinj_reflow' : (1, 'scalar_text', 0, 0),
- 'xmlinj_transform' : (1, 'scalar_text', 0, 0),
+ 'startID.id' : (1, 'number', 0, 0),
+
}
return
- # loop: pass though values unchanged
- # DO NOT CHANGE - this has proven to be correct
- def doLoop76Mode0(self, argtype, cnt):
- result = []
- for i in xrange(cnt):
- result.append(self.formatArg(readEncodedNumber(self.fo), argtype))
- return result
-
-
- # loop generating values relative to the *negative*
- # of the offset - don't ask why - it just is
- # DO NOT CHANGE - this has proven to be correct
- def doLoop76Mode1(self, argtype, cnt):
- result = []
- offset = -readEncodedNumber(self.fo)
- for i in xrange(cnt):
- val = readEncodedNumber(self.fo) + offset
- result.append(self.formatArg(val, argtype))
- return result
-
-
- # loop generating values with starting value and accumulation
- # DO NOT CHANGE - this has proven to be the correct
- def doLoop76Mode2(self, argtype, cnt):
- result = []
- ptr = readEncodedNumber(self.fo)
- result.append(self.formatArg(ptr, argtype))
- for i in xrange(cnt-1):
- ptr = ptr + readEncodedNumber(self.fo)
- result.append(self.formatArg(ptr, argtype))
- return result
-
-
- # loop generating values with starting value and accumulation
- # **after** subtracting adjustment value from each
- # DO NOT CHANGE - this has been proven to be correct
- def doLoop76Mode3(self, argtype, cnt):
- result = []
- adj = readEncodedNumber(self.fo)
- ptr = readEncodedNumber(self.fo)
- ptr = ptr - adj
- result.append(self.formatArg(ptr, argtype))
- for i in xrange(cnt-1):
- ptr = ptr + readEncodedNumber(self.fo) - adj
- result.append(self.formatArg(ptr,argtype))
- return result
-
-
- # loop using runing sum of data values and starting value
- # with accumulation to get new value
- # Again, don't ask it took me forever to figure this out
- # DO NOT CHANGE - this has been proven to be correct
- def doLoop76Mode4(self, argtype, cnt):
- result = []
- val = readEncodedNumber(self.fo)
- runsum = val
- ptr = val
- result.append(self.formatArg(ptr, argtype))
- for i in xrange(cnt-1):
- runsum += readEncodedNumber(self.fo)
- ptr = ptr + runsum
- result.append(self.formatArg(ptr,argtype))
- return result
-
- # loop using and extra value as an adjustment
- # and a running sum of the values after subtracting
- # the adjustment, added to a ptr to get a new pointer
- def doLoop76Mode5(self, argtype, cnt):
+ # general loop code graciously submitted by "skindle" - thank you!
+ def doLoop76Mode(self, argtype, cnt, mode):
result = []
- adj = readEncodedNumber(self.fo)
- ptr = 0
- runsum = 0
+ adj = 0
+ if mode & 1:
+ adj = readEncodedNumber(self.fo)
+ mode = mode >> 1
+ x = []
for i in xrange(cnt):
- val = readEncodedNumber(self.fo)
- runsum += (val - adj)
- ptr = ptr +runsum
- result.append(self.formatArg(ptr,argtype))
- return result
-
-
- # FIXME: I have only 4 points to work this out with inside my book
- # So may be wrong but it is correct for my 4 points
- def doLoop76Mode6(self, argtype, cnt):
- result = []
- oldval = 0
+ x.append(readEncodedNumber(self.fo) - adj)
+ for i in xrange(mode):
+ for j in xrange(1, cnt):
+ x[j] = x[j] + x[j - 1]
for i in xrange(cnt):
- val = readEncodedNumber(self.fo)
- ptr= (3 * oldval) + val + 1
- result.append(self.formatArg(ptr,argtype))
- oldval = val
+ result.append(self.formatArg(x[i],argtype))
return result
-
# dispatches loop commands bytes with various modes
# The 0x76 style loops are used to build vectors
# since they did not appear in the test cases
def decodeCMD(self, cmd, argtype):
-
- # if (cmd == 0x72):
- # self.doLoop72(argtype)
- # result =[]
- # return result
-
if (cmd == 0x76):
+
# loop with cnt, and mode to control loop styles
cnt = readEncodedNumber(self.fo)
mode = readEncodedNumber(self.fo)
- if self.debug : print 'Loop for', cnt, 'with mode', mode, ': '
-
- if (mode == 0x00):
- return self.doLoop76Mode0(argtype, cnt)
-
- elif (mode == 0x01):
- return self.doLoop76Mode1(argtype, cnt)
-
- elif (mode == 0x02):
- return self.doLoop76Mode2(argtype, cnt)
-
- elif (mode == 0x03):
- return self.doLoop76Mode3(argtype, cnt)
-
- elif (mode == 0x04):
- return self.doLoop76Mode4(argtype, cnt)
-
- elif (mode == 0x05):
- return self.doLoop76Mode5(argtype, cnt)
-
- elif (mode == 0x06):
- return self.doLoop76Mode6(argtype, cnt)
-
- else:
-
- if self.debug :
- # try to mark any unknown loop comands
- # if they exist, unless they are used to process
- # text or some other known list, we won't be able to prove them correct
- print '*** Unknown Loop 0x%x %d %d :' % (cmd, cnt, mode)
- for i in xrange(cnt):
- val = readEncodedNumber(self.fo)
- print ' 0x%x' % val,
- print ' '
- result = []
- return result
+ if self.debug : print 'Loop for', cnt, 'with mode', mode, ': '
+ return self.doLoop76Mode(argtype, cnt, mode)
- if self.dbug: print "Unknown command", cmd
+ if self.debug: print "Unknown command", cmd
result = []
return result
+
+
# add full tag path to injected snippets
def updateName(self, tag, prefix):
self.doc.append(tag)
else:
if self.debug:
- print "Mina Loop: Unknown value: %x" % v
+ print "Main Loop: Unknown value: %x" % v
# now do snippet injection
class DocParser(object):
- def __init__(self, flatxml, fileid):
+ def __init__(self, flatxml, classlst, fileid):
self.id = os.path.basename(fileid).replace('.dat','')
self.flatdoc = flatxml.split('\n')
+ self.classList = {}
+ tmpList = classlst.split('\n')
+ for pclass in tmpList:
+ if pclass != '':
+ # remove the leading period from the css name
+ cname = pclass[1:]
+ self.classList[cname] = True
self.ocrtext = []
self.link_id = []
self.link_title = []
self.paracont_stemid = []
self.parastems_stemid = []
+ # return the (name, argument) pair of the flat-xml line at index pos
+ def lineinDoc(self, pos) :
+ docList = self.flatdoc
+ cnt = len(docList)
+ if (pos >= 0) and (pos < cnt) :
+ item = docList[pos]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=',1)
+ else :
+ name = item
+ argres = ''
+ return name, argres
# find tag if within pos to end inclusive
return startpos
- # get a description of the paragraph
+ # build a description of the paragraph
def getParaDescription(self, start, end):
+
+ result = []
+
# normal paragraph
(pos, pclass) = self.findinDoc('paragraph.class',start,end)
- # class names are an issue given topaz starts them with numerals (not allowed)
- # use a mix of cases, (which cause some browsers problems), and actually
- # attach numbers after "reclustered*" to the end to deal with reflow issues
- # so we clean this up by lowercasing, prepend 'cl_', and remove all end pieces after reclustered
+ # class names are an issue given topaz may start them with numerals (not allowed),
+ # use a mix of cases (which cause some browsers problems), and actually
+ # attach numbers after "_reclustered*" to the end to deal with reflow issues
+ # but then not actually provide all of these _reclustered classes in the stylesheet!
+
+ # so we clean this up by lowercasing, prepend 'cl_', and if not in the class
+ # list from the stylesheet, trying once more with "_reclustered*" removed
+ # if still not in stylesheet, let it pass as is
pclass = pclass.lower()
pclass = 'cl_' + pclass
- p = pclass.find('reclustered')
- if p > 0 : pclass = pclass[0:p+11]
-
+ if pclass not in self.classList:
+ p = pclass.find('_reclustered')
+ if p > 0 :
+ baseclass = pclass[0:p]
+ if baseclass in self.classList:
+ pclass = baseclass
+
+ # build up a description of the paragraph in result and return it
+ # first check for the basic - all words paragraph
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
(pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
if (sfirst != None) and (slast != None) :
- return pclass, int(sfirst), int(slast)
-
- # some paragraphs are instead split into multiple spans and some even have word_semantic tags as well
- # so walk through this region keeping track of the first firstword, and the last lastWord
- # on any items that have it
- (pos, sfirst) = self.findinDoc('firstWord',start, end)
- first = int(sfirst)
- last = -1
- for i in xrange(pos+1,end):
- (pos, slast) = self.findinDoc('lastWord',i,i+1)
- if slast != None:
- last = int(slast)
- return pclass, first, last
-
-
- def buildParagraph(self, cname, first, last, type, regtype) :
+ first = int(sfirst)
+ last = int(slast)
+ for wordnum in xrange(first, last):
+ result.append(('ocr', wordnum))
+ return pclass, result
+
+ # this type of paragraph may be made up of multiple _spans, inline
+ # word monograms (images) and words with semantic meaning
+
+ # need to parse this type line by line
+ line = start + 1
+ word_class = ''
+
+ while (line < end) :
+
+ (name, argres) = self.lineinDoc(line)
+
+ if name.endswith('_span.firstWord') :
+ first = int(argres)
+ (name, argres) = self.lineinDoc(line+1)
+ if not name.endswith('_span.lastWord'):
+ print 'Error: - incorrect _span ordering inside paragraph'
+ last = int(argres)
+ for wordnum in xrange(first, last):
+ result.append(('ocr', wordnum))
+ line += 1
+
+ elif name.endswith('word.class'):
+ # only the text before the first '-' is needed; a class without '-' must not raise
+ cname = argres.split('-',1)[0]
+ if cname == 'spaceafter':
+ word_class = 'sa'
+
+ elif name.endswith('word.img.src'):
+ result.append(('img' + word_class, int(argres)))
+ word_class = ''
+
+ elif name.endswith('word_semantic.firstWord'):
+ first = int(argres)
+ (name, argres) = self.lineinDoc(line+1)
+ if not name.endswith('word_semantic.lastWord'):
+ print 'Error: - incorrect word_semantic ordering inside paragraph'
+ last = int(argres)
+ for wordnum in xrange(first, last):
+ result.append(('ocr', wordnum))
+ line += 1
+
+ line += 1
+
+ return pclass, result
+
+
+ def buildParagraph(self, cname, pdesc, type, regtype) :
parares = ''
sep =''
+
br_lb = False
if (regtype == 'fixed') or (regtype == 'chapterheading') :
br_lb = True
+
handle_links = False
if len(self.link_id) > 0:
handle_links = True
+
if (type == 'full') or (type == 'begin') :
parares += '<p class="' + cname + '">'
+
if (type == 'end'):
parares += ' '
- for j in xrange(first, last) :
- word = self.ocrtext[j]
- sep = ' '
-
- if handle_links:
- link = self.link_id[j]
- if (link > 0):
- title = self.link_title[link-1]
- if title == "": title='_link_'
- ptarget = self.link_page[link-1] - 1
- linkhtml = '<a href="#page%04d">' % ptarget
- linkhtml += title + '</a>'
- pos = parares.rfind(title)
- if pos >= 0:
- parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
+
+ cnt = len(pdesc)
+
+ for j in xrange( 0, cnt) :
+
+ (wtype, num) = pdesc[j]
+
+ if wtype == 'ocr' :
+ word = self.ocrtext[num]
+ sep = ' '
+
+ if handle_links:
+ link = self.link_id[num]
+ if (link > 0):
+ title = self.link_title[link-1]
+ if title == "": title='_link_'
+ ptarget = self.link_page[link-1] - 1
+ linkhtml = '<a href="#page%04d">' % ptarget
+ linkhtml += title + '</a>'
+ pos = parares.rfind(title)
+ if pos >= 0:
+ parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
+ else :
+ parares += linkhtml
+ if word == '_link_' : word = ''
+ elif (link < 0) :
+ if word == '_link_' : word = ''
+
+ if word == '_lb_':
+ if (num-1) in self.dehyphen_rootid :
+ word = ''
+ sep = ''
+ elif handle_links :
+ word = ''
+ sep = ''
+ elif br_lb :
+ word = '<br />\n'
+ sep = ''
else :
- parares += linkhtml
- if word == '_link_' : word = ''
- elif (link < 0) :
- if word == '_link_' : word = ''
-
- if word == '_lb_':
- if (j-1) in self.dehyphen_rootid :
- word = ''
- sep = ''
- elif handle_links :
- word = ''
- sep = ''
- elif br_lb :
- word = '<br />\n'
- sep = ''
- else :
- word = '\n'
+ word = '\n'
+ sep = ''
+
+ if num in self.dehyphen_rootid :
+ word = word[0:-1]
sep = ''
- if j in self.dehyphen_rootid :
- word = word[0:-1]
+ parares += word + sep
+
+ elif wtype == 'img' :
sep = ''
+ parares += '<img src="img/img%04d.jpg" alt="" />' % num
+ parares += sep
- parares += word + sep
+ elif wtype == 'imgsa' :
+ sep = ' '
+ parares += '<img src="img/img%04d.jpg" alt="" />' % num
+ parares += sep
if len(sep) > 0 : parares = parares[0:-1]
if (type == 'full') or (type == 'end') :
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
elif regtype == 'chapterheading' :
- (pclass, first, last) = self.getParaDescription(start,end)
+ (pclass, pdesc) = self.getParaDescription(start,end)
if not breakSet:
htmlpage += '<div style="page-break-after: always;"> </div>\n'
breakSet = True
if pclass[3:7] == 'ch2-' : tag = 'h2'
if pclass[3:7] == 'ch3-' : tag = 'h3'
htmlpage += '<' + tag + ' class="' + pclass + '">'
- htmlpage += self.buildParagraph(pclass,first,last,'middle', regtype)
+ htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>'
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') :
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
anchorSet = True
- (pclass, first, last) = self.getParaDescription(start,end)
+ (pclass, pdesc) = self.getParaDescription(start,end)
if ptype == 'full' :
tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5'
if pclass[3:6] == 'h3-' : tag = 'h6'
htmlpage += '<' + tag + ' class="' + pclass + '">'
- htmlpage += self.buildParagraph(pclass, first, last, 'middle', regtype)
+ htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>'
else :
- htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype)
+ htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'tocentry') :
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
anchorSet = True
- (pclass, first, last) = self.getParaDescription(start,end)
- htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype)
+ (pclass, pdesc) = self.getParaDescription(start,end)
+ htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
+
+ elif regtype == 'synth_fcvr.center' :
+ if not anchorSet:
+ htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
+ anchorSet = True
+ (pos, simgsrc) = self.findinDoc('img.src',start,end)
+ if simgsrc:
+ htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
else :
- print 'Unknown region type', regtype
- print 'Warning: skipping this region'
+ print 'Warning: Unknown region type', regtype
+ print 'Treating this like a "fixed" region'
+ regtype = 'fixed'
+ ptype = 'full'
+ # check to see if this is a continuation from the previous page
+ if (len(self.parastems_stemid) > 0):
+ ptype = 'end'
+ self.parastems_stemid=[]
+ else:
+ if not anchorSet:
+ htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '"> </div>\n'
+ anchorSet = True
+ (pclass, pdesc) = self.getParaDescription(start,end)
+ if ptype == 'full' :
+ tag = 'p'
+ if pclass[3:6] == 'h1-' : tag = 'h4'
+ if pclass[3:6] == 'h2-' : tag = 'h5'
+ if pclass[3:6] == 'h3-' : tag = 'h6'
+ htmlpage += '<' + tag + ' class="' + pclass + '">'
+ htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
+ htmlpage += '</' + tag + '>'
+ else :
+ htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
+
+
if len(self.paracont_stemid) > 0 :
if htmlpage[-4:] == '</p>':
-def convert2HTML(flatxml, fileid):
+def convert2HTML(flatxml, classlst, fileid):
# create a document parser
- dp = DocParser(flatxml, fileid)
+ dp = DocParser(flatxml, classlst, fileid)
htmlpage = dp.process()
class GParser(object):
- def __init__(self, flatxml):
- self.flatdoc = flatxml.split('\n')
- self.dpi = 1440
- self.gh = self.getData('info.glyph.h')
- self.gw = self.getData('info.glyph.w')
- self.guse = self.getData('info.glyph.use')
- self.count = len(self.guse)
- self.gvtx = self.getData('info.glyph.vtx')
- self.glen = self.getData('info.glyph.len')
- self.gdpi = self.getData('info.glyph.dpi')
- self.vx = self.getData('info.vtx.x')
- self.vy = self.getData('info.vtx.y')
- self.vlen = self.getData('info.len.n')
- self.glen.append(len(self.vlen))
- self.gvtx.append(len(self.vx))
-
- def getData(self, path):
- result = None
- cnt = len(self.flatdoc)
- for j in xrange(cnt):
- item = self.flatdoc[j]
- if item.find('=') >= 0:
- (name, argt) = item.split('=')
- argres = argt.split('|')
- else:
- name = item
- argres = []
- if (name == path):
- result = argres
- break
- if (len(argres) > 0) :
- for j in xrange(0,len(argres)):
- argres[j] = int(argres[j])
- return result
-
- def getPath(self, gly):
- path = ''
- if (gly < 0) or (gly >= self.count):
- return path
- tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]-1]
- ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]-1]
- p = 0
- for k in xrange(self.glen[gly], self.glen[gly+1]):
- if (p == 0):
- zx = tx[0:self.vlen[k]+1]
- zy = ty[0:self.vlen[k]+1]
- else:
- zx = tx[self.vlen[k-1]+1:self.vlen[k]+1]
- zy = ty[self.vlen[k-1]+1:self.vlen[k]+1]
- p += 1
- for j in xrange(0, len(zx)):
- if (j == 0):
- path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
- else:
- path += 'L %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
- path += 'z'
- return path
+ def __init__(self, flatxml):
+ self.flatdoc = flatxml.split('\n')
+ self.dpi = 1440
+ self.gh = self.getData('info.glyph.h')
+ self.gw = self.getData('info.glyph.w')
+ self.guse = self.getData('info.glyph.use')
+ self.count = len(self.guse)
+ self.gvtx = self.getData('info.glyph.vtx')
+ self.glen = self.getData('info.glyph.len')
+ self.gdpi = self.getData('info.glyph.dpi')
+ self.vx = self.getData('info.vtx.x')
+ self.vy = self.getData('info.vtx.y')
+ self.vlen = self.getData('info.len.n')
+ self.glen.append(len(self.vlen))
+ self.gvtx.append(len(self.vx))
+
+ def getData(self, path):
+ result = None
+ cnt = len(self.flatdoc)
+ for j in xrange(cnt):
+ item = self.flatdoc[j]
+ if item.find('=') >= 0:
+ (name, argt) = item.split('=')
+ argres = argt.split('|')
+ else:
+ name = item
+ argres = []
+ if (name == path):
+ result = argres
+ break
+ if (len(argres) > 0) :
+ for j in xrange(0,len(argres)):
+ argres[j] = int(argres[j])
+ return result
+
+ def getPath(self, gly):
+ path = ''
+ if (gly < 0) or (gly >= self.count):
+ return path
+ tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]-1]
+ ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]-1]
+ p = 0
+ for k in xrange(self.glen[gly], self.glen[gly+1]):
+ if (p == 0):
+ zx = tx[0:self.vlen[k]+1]
+ zy = ty[0:self.vlen[k]+1]
+ else:
+ zx = tx[self.vlen[k-1]+1:self.vlen[k]+1]
+ zy = ty[self.vlen[k-1]+1:self.vlen[k]+1]
+ p += 1
+ j = 0
+ while ( j < len(zx) ):
+ if (j == 0):
+ # Start Position.
+ path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
+ elif (j <= len(zx)-3):
+ # Cubic Bezier Curve
+ path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[j+2] * self.dpi / self.gdpi[gly], zy[j+2] * self.dpi / self.gdpi[gly])
+ j += 2
+ elif (j == len(zx)-2):
+ # Cubic Bezier Curve to Start Position
+ path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly])
+ j += 1
+ elif (j == len(zx)-1):
+ # Quadratic Bezier Curve to Start Position
+ path += 'Q %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly])
+
+ j += 1
+ path += 'z'
+ return path
class PParser(object):
- def __init__(self, flatxml):
- self.flatdoc = flatxml.split('\n')
- self.temp = []
- self.ph = self.getData('page.h')[0]
- self.pw = self.getData('page.w')[0]
- self.gx = self.getData('info.glyph.x')
- self.gy = self.getData('info.glyph.y')
- self.gid = self.getData('info.glyph.glyphID')
-
- def getData(self, path):
- result = None
- cnt = len(self.flatdoc)
- for j in xrange(cnt):
- item = self.flatdoc[j]
- if item.find('=') >= 0:
- (name, argt) = item.split('=')
- argres = argt.split('|')
- else:
- name = item
- argres = []
- if (name.endswith(path)):
- result = argres
- break
- if (len(argres) > 0) :
- for j in xrange(0,len(argres)):
- argres[j] = int(argres[j])
- return result
-
- def getDataTemp(self, path):
- result = None
- cnt = len(self.temp)
- for j in xrange(cnt):
- item = self.temp[j]
- if item.find('=') >= 0:
- (name, argt) = item.split('=')
- argres = argt.split('|')
- else:
- name = item
- argres = []
- if (name.endswith(path)):
- result = argres
- self.temp.pop(j)
- break
- if (len(argres) > 0) :
- for j in xrange(0,len(argres)):
- argres[j] = int(argres[j])
- return result
-
- def getImages(self):
- result = []
- self.temp = self.flatdoc
- while (self.getDataTemp('region.img') != None):
- h = self.getDataTemp('region.img.h')[0]
- w = self.getDataTemp('region.img.w')[0]
- x = self.getDataTemp('region.img.x')[0]
- y = self.getDataTemp('region.img.y')[0]
- src = self.getDataTemp('region.img.src')[0]
- result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
- return result
-
- def getGlyphs(self,glyfname):
- result = []
- if (self.gid != None) and (len(self.gid) > 0):
- glyphs = []
- for j in set(self.gid):
- glyphs.append(j)
- glyphs.sort()
- gfile = open(glyfname, 'r')
- j = 0
- while True :
- inp = gfile.readline()
- if (inp == ''):
- break
- id='id="gl%d"' % glyphs[j]
- if (inp.find(id) > 0):
- result.append(inp)
- j += 1
- if (j == len(glyphs)):
- break
- gfile.close()
- return result
+ def __init__(self, flatxml):
+ self.flatdoc = flatxml.split('\n')
+ self.temp = []
+ foo = self.getData('page.h') or self.getData('book.h')
+ self.ph = foo[0]
+ foo = self.getData('page.w') or self.getData('book.w')
+ self.pw = foo[0]
+ self.gx = self.getData('info.glyph.x')
+ self.gy = self.getData('info.glyph.y')
+ self.gid = self.getData('info.glyph.glyphID')
+
+ def getData(self, path):
+ result = None
+ cnt = len(self.flatdoc)
+ for j in xrange(cnt):
+ item = self.flatdoc[j]
+ if item.find('=') >= 0:
+ (name, argt) = item.split('=')
+ argres = argt.split('|')
+ else:
+ name = item
+ argres = []
+ if (name.endswith(path)):
+ result = argres
+ break
+ if (len(argres) > 0) :
+ for j in xrange(0,len(argres)):
+ argres[j] = int(argres[j])
+ return result
+
+ def getDataTemp(self, path):
+ result = None
+ cnt = len(self.temp)
+ for j in xrange(cnt):
+ item = self.temp[j]
+ if item.find('=') >= 0:
+ (name, argt) = item.split('=')
+ argres = argt.split('|')
+ else:
+ name = item
+ argres = []
+ if (name.endswith(path)):
+ result = argres
+ self.temp.pop(j)
+ break
+ if (len(argres) > 0) :
+ for j in xrange(0,len(argres)):
+ argres[j] = int(argres[j])
+ return result
+
+ def getImages(self):
+ result = []
+ self.temp = self.flatdoc
+ while (self.getDataTemp('img') != None):
+ h = self.getDataTemp('img.h')[0]
+ w = self.getDataTemp('img.w')[0]
+ x = self.getDataTemp('img.x')[0]
+ y = self.getDataTemp('img.y')[0]
+ src = self.getDataTemp('img.src')[0]
+ result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
+ return result
+
+ def getGlyphs(self,glyfname):
+ result = []
+ if (self.gid != None) and (len(self.gid) > 0):
+ glyphs = []
+ for j in set(self.gid):
+ glyphs.append(j)
+ glyphs.sort()
+ gfile = open(glyfname, 'r')
+ j = 0
+ while True :
+ inp = gfile.readline()
+ if (inp == ''):
+ break
+ id='id="gl%d"' % glyphs[j]
+ if (inp.find(id) > 0):
+ result.append(inp)
+ j += 1
+ if (j == len(glyphs)):
+ break
+ gfile.close()
+ return result
def usage():
- print 'Usage: '
- print ' '
- print ' gensvg.py unencryptedBookDir'
- print ' '
+    # Print the command-line synopsis, one output line per entry.
+    for text in ('Usage: ', ' ', ' gensvg.py unencryptedBookDir', ' '):
+        print text
def main(argv):
- bookDir = ''
-
- if len(argv) == 0:
- argv = sys.argv
- else :
- argv = argv.split()
-
- try:
- opts, args = getopt.getopt(argv[1:], "h:")
-
- except getopt.GetoptError, err:
- print str(err)
- usage()
- sys.exit(2)
-
- if len(opts) == 0 and len(args) == 0 :
- usage()
- sys.exit(2)
-
- for o, a in opts:
- if o =="-h":
- usage()
- sys.exit(0)
-
- bookDir = args[0]
-
- if not os.path.exists(bookDir) :
- print "Can not find directory with unencrypted book"
- sys.exit(-1)
-
- dictFile = os.path.join(bookDir,'dict0000.dat')
-
- if not os.path.exists(dictFile) :
- print "Can not find dict0000.dat file"
- sys.exit(-1)
-
- pageDir = os.path.join(bookDir,'page')
- if not os.path.exists(pageDir) :
- print "Can not find page directory in unencrypted book"
- sys.exit(-1)
-
- imgDir = os.path.join(bookDir,'img')
- if not os.path.exists(imgDir) :
- print "Can not find image directory in unencrypted book"
- sys.exit(-1)
-
- glyphsDir = os.path.join(bookDir,'glyphs')
- if not os.path.exists(glyphsDir) :
- print "Can not find glyphs directory in unencrypted book"
- sys.exit(-1)
-
- metaFile = os.path.join(bookDir,'metadata0000.dat')
- if not os.path.exists(metaFile) :
- print "Can not find metadata0000.dat in unencrypted book"
- sys.exit(-1)
-
- svgDir = os.path.join(bookDir,'svg')
- if not os.path.exists(svgDir) :
- os.makedirs(svgDir)
-
-
- print 'Processing Meta Data ... '
-
- print ' ', 'metadata0000.dat'
- fname = os.path.join(bookDir,'metadata0000.dat')
- metadata = decode_meta.getMetaArray(fname)
-
- print 'Processing Glyphs ... '
-
- filenames = os.listdir(glyphsDir)
- filenames = sorted(filenames)
-
- glyfname = os.path.join(svgDir,'glyphs.svg')
- glyfile = open(glyfname, 'w')
- glyfile.write('<?xml version="1.0" standalone="no"?>\n')
- glyfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
- glyfile.write('<svg width="512" height="512" viewBox="0 0 511 511" xmlns="http://www.w3.org/2000/svg" version="1.1">\n')
- glyfile.write('<title>Glyphs for %s</title>\n' % metadata['Title'])
- glyfile.write('<defs>\n')
- counter = 0
- for filename in filenames:
- print ' ', filename
- fname = os.path.join(glyphsDir,filename)
- flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
- gp = GParser(flat_xml)
- for i in xrange(0, gp.count):
- path = gp.getPath(i)
- glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path))
- counter += 1
- glyfile.write('</defs>\n')
- glyfile.write('</svg>\n')
- glyfile.close()
-
- print 'Processing Pages ... '
-
- scaledpi = 720
- filenames = os.listdir(pageDir)
- filenames = sorted(filenames)
- counter = 0
- for filename in filenames:
- print ' ', filename
- fname = os.path.join(pageDir,filename)
- flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
- pp = PParser(flat_xml)
- pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
- pfile.write('<?xml version="1.0" standalone="no"?>\n')
- pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
- pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1))
- pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
- if (pp.gid != None):
- pfile.write('<defs>\n')
- gdefs = pp.getGlyphs(glyfname)
- for j in xrange(0,len(gdefs)):
- pfile.write(gdefs[j])
- pfile.write('</defs>\n')
- for j in xrange(0,len(pp.gid)):
- pfile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j]))
- img = pp.getImages()
- if (img != None):
- for j in xrange(0,len(img)):
- pfile.write(img[j])
- pfile.write('</svg>')
- pfile.close()
- counter += 1
-
- print 'Processing Complete'
-
- return 0
+    """Build SVG files for the unencrypted book directory named in argv.
+
+    argv is a single command-line string ('gensvg.py bookDir'); an empty
+    string means "use sys.argv".  Writes svg/glyphs.svg plus one .svg per
+    file found in the book's page directory, and returns 0.
+
+    Exits through sys.exit(2) on bad usage, sys.exit(0) after -h, and
+    sys.exit(-1) when a required input file or directory is missing.
+    """
+    if len(argv) == 0:
+        argv = sys.argv
+    else:
+        argv = argv.split()
+
+    try:
+        # "h" without a colon: -h is a plain flag.  The former "h:" spec made
+        # getopt demand an argument, so a bare -h was reported as an error.
+        opts, args = getopt.getopt(argv[1:], "h")
+    except getopt.GetoptError, err:
+        print str(err)
+        usage()
+        sys.exit(2)
+
+    for opt, _unused in opts:
+        if opt == "-h":
+            usage()
+            sys.exit(0)
+
+    if len(args) == 0:
+        usage()
+        sys.exit(2)
+
+    bookDir = args[0]
+    dictFile = os.path.join(bookDir,'dict0000.dat')
+    pageDir = os.path.join(bookDir,'page')
+    imgDir = os.path.join(bookDir,'img')
+    glyphsDir = os.path.join(bookDir,'glyphs')
+    metaFile = os.path.join(bookDir,'metadata0000.dat')
+
+    # Checked in this order; the first missing input aborts the run.
+    required = [
+        (bookDir, "Can not find directory with unencrypted book"),
+        (dictFile, "Can not find dict0000.dat file"),
+        (pageDir, "Can not find page directory in unencrypted book"),
+        (imgDir, "Can not find image directory in unencrypted book"),
+        (glyphsDir, "Can not find glyphs directory in unencrypted book"),
+        (metaFile, "Can not find metadata0000.dat in unencrypted book"),
+    ]
+    for needed, complaint in required:
+        if not os.path.exists(needed):
+            print complaint
+            sys.exit(-1)
+
+    svgDir = os.path.join(bookDir,'svg')
+    if not os.path.exists(svgDir):
+        os.makedirs(svgDir)
+
+    print 'Processing Meta Data ... '
+
+    print ' ', 'metadata0000.dat'
+    metadata = decode_meta.getMetaArray(metaFile)
+
+    print 'Processing Glyphs ... '
+
+    glyfname = os.path.join(svgDir,'glyphs.svg')
+    glyfile = open(glyfname, 'w')
+    try:
+        glyfile.write('<?xml version="1.0" standalone="no"?>\n')
+        glyfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
+        glyfile.write('<svg width="512" height="512" viewBox="0 0 511 511" xmlns="http://www.w3.org/2000/svg" version="1.1">\n')
+        glyfile.write('<title>Glyphs for %s</title>\n' % metadata['Title'])
+        glyfile.write('<defs>\n')
+        counter = 0
+        for filename in sorted(os.listdir(glyphsDir)):
+            print ' ', filename
+            fname = os.path.join(glyphsDir,filename)
+            # NOTE(review): the command is passed as one string; if
+            # convert2xml.main splits it on whitespace the way this main()
+            # does, paths containing spaces will break -- confirm.
+            flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
+            gp = GParser(flat_xml)
+            for i in xrange(0, gp.count):
+                path = gp.getPath(i)
+                # Glyph ids advance by 256 for each glyph file processed.
+                glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path))
+            counter += 1
+        glyfile.write('</defs>\n')
+        glyfile.write('</svg>\n')
+    finally:
+        # Close the glyph file even if a conversion step raises.
+        glyfile.close()
+
+    print 'Processing Pages ... '
+
+    # Float on purpose: with an int, Python 2 '/' truncated the page size to
+    # whole inches before it was formatted with %f.
+    scaledpi = 720.0
+    counter = 0
+    for filename in sorted(os.listdir(pageDir)):
+        print ' ', filename
+        fname = os.path.join(pageDir,filename)
+        flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
+        pp = PParser(flat_xml)
+        pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
+        try:
+            pfile.write('<?xml version="1.0" standalone="no"?>\n')
+            pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
+            pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1))
+            pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
+            if pp.gid is not None:
+                # Copy the definitions of the glyphs this page uses, then
+                # place each glyph occurrence with a <use> reference.
+                pfile.write('<defs>\n')
+                for gdef in pp.getGlyphs(glyfname):
+                    pfile.write(gdef)
+                pfile.write('</defs>\n')
+                for j in xrange(0,len(pp.gid)):
+                    pfile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j]))
+            # getImages() always returns a list, so no None check is needed.
+            for image in pp.getImages():
+                pfile.write(image)
+            pfile.write('</svg>')
+        finally:
+            # Close the page file even if writing it raises.
+            pfile.close()
+        counter += 1
+
+    print 'Processing Complete'
+
+    return 0
if __name__ == '__main__':
- sys.exit(main(''))
+ # The empty string makes main() fall back to sys.argv for its arguments.
+ sys.exit(main(''))
\ No newline at end of file