# Revision history:
# 0.1 - Initial alpha testing release 2020 by Pu D. Pud
# 0.2 - Python 3 for calibre 5.0 (in testing)
+# 0.3 - More Python3 fixes
"""
"""
__license__ = 'GPL v3'
-__version__ = "0.2"
+__version__ = "0.3"
import sys
import os
import zlib
import struct
import hashlib
-from decimal import *
-from itertools import chain, islice
+from io import BytesIO
+from decimal import Decimal
+import itertools
import xml.etree.ElementTree as etree
# Wrap a stream so that output gets flushed immediately
if self.encoding == None:
self.encoding = "utf-8"
def write(self, data):
- if isinstance(data, str):
+ if isinstance(data,str) or isinstance(data,unicode):
+ # str for Python3, unicode for Python2
data = data.encode(self.encoding,"replace")
- self.stream.buffer.write(data)
- self.stream.buffer.flush()
-
+ try:
+ buffer = getattr(self.stream, 'buffer', self.stream)
+ # self.stream.buffer for Python3, self.stream for Python2
+ buffer.write(data)
+ buffer.flush()
+ except:
+ # We can do nothing if a write fails
+ raise
def __getattr__(self, attr):
return getattr(self.stream, attr)
return ["ignoblepdf.py"]
else:
argvencoding = sys.stdin.encoding or "utf-8"
- return [arg if isinstance(arg, str) else str(arg, argvencoding) for arg in sys.argv]
+ return [arg if (isinstance(arg, str) or isinstance(arg,unicode)) else str(arg, argvencoding) for arg in sys.argv]
class IGNOBLEError(Exception):
ARC4, AES = _load_crypto()
-from io import BytesIO
# Do we generate cross reference streams on output?
elif l == 2:
return struct.unpack('>H', s)[0]
elif l == 3:
- return struct.unpack('>L', '\x00'+s)[0]
+ return struct.unpack('>L', bytes([0]) + s)[0]
elif l == 4:
return struct.unpack('>L', s)[0]
else:
Use PSKeywordTable.intern() instead.
'''
def __init__(self, name):
- self.name = name
+ self.name = name.decode('utf-8')
return
def __repr__(self):
PSKeywordTable = PSSymbolTable(PSKeyword)
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
-KEYWORD_BRACE_BEGIN = KWD('{')
-KEYWORD_BRACE_END = KWD('}')
-KEYWORD_ARRAY_BEGIN = KWD('[')
-KEYWORD_ARRAY_END = KWD(']')
-KEYWORD_DICT_BEGIN = KWD('<<')
-KEYWORD_DICT_END = KWD('>>')
+KEYWORD_BRACE_BEGIN = KWD(b'{')
+KEYWORD_BRACE_END = KWD(b'}')
+KEYWORD_ARRAY_BEGIN = KWD(b'[')
+KEYWORD_ARRAY_END = KWD(b']')
+KEYWORD_DICT_BEGIN = KWD(b'<<')
+KEYWORD_DICT_END = KWD(b'>>')
def literal_name(x):
## PSBaseParser
##
-EOL = re.compile(r'[\r\n]')
-SPC = re.compile(r'\s')
-NONSPC = re.compile(r'\S')
-HEX = re.compile(r'[0-9a-fA-F]')
-END_LITERAL = re.compile(r'[#/%\[\]()<>{}\s]')
-END_HEX_STRING = re.compile(r'[^\s0-9a-fA-F]')
-HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.')
-END_NUMBER = re.compile(r'[^0-9]')
-END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
-END_STRING = re.compile(r'[()\134]')
-OCT_STRING = re.compile(r'[0-7]')
-ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
+EOL = re.compile(br'[\r\n]')
+SPC = re.compile(br'\s')
+NONSPC = re.compile(br'\S')
+HEX = re.compile(br'[0-9a-fA-F]')
+END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]')
+END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]')
+HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.')
+END_NUMBER = re.compile(br'[^0-9]')
+END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
+END_STRING = re.compile(br'[()\\]')
+OCT_STRING = re.compile(br'[0-7]')
+ESC_STRING = { b'b':8, b't':9, b'n':10, b'f':12, b'r':13, b'(':40, b')':41, b'\\':92 }
class PSBaseParser(object):
self.fp.seek(pos)
# reset the status for nextline()
self.bufpos = pos
- self.buf = ''
+ self.buf = b''
self.charpos = 0
# reset the status for nexttoken()
self.parse1 = self.parse_main
if not m:
return (self.parse_main, len(s))
j = m.start(0)
- c = s[j]
+ if isinstance(s[j], str):
+ # Python 2
+ c = s[j]
+ else:
+ # Python 3
+ c = bytes([s[j]])
self.tokenstart = self.bufpos+j
- if c == '%':
- self.token = '%'
+ if c == b'%':
+ self.token = c
return (self.parse_comment, j+1)
- if c == '/':
- self.token = ''
+ if c == b'/':
+ self.token = b''
return (self.parse_literal, j+1)
- if c in '-+' or c.isdigit():
+ if c in b'-+' or c.isdigit():
self.token = c
return (self.parse_number, j+1)
- if c == '.':
+ if c == b'.':
self.token = c
return (self.parse_decimal, j+1)
if c.isalpha():
self.token = c
return (self.parse_keyword, j+1)
- if c == '(':
- self.token = ''
+ if c == b'(':
+ self.token = b''
self.paren = 1
return (self.parse_string, j+1)
- if c == '<':
- self.token = ''
+ if c == b'<':
+ self.token = b''
return (self.parse_wopen, j+1)
- if c == '>':
- self.token = ''
+ if c == b'>':
+ self.token = b''
return (self.parse_wclose, j+1)
self.add_token(KWD(c))
return (self.parse_main, j+1)
return (self.parse_literal, len(s))
j = m.start(0)
self.token += s[i:j]
- c = s[j]
- if c == '#':
- self.hex = ''
+ if isinstance(s[j], str):
+ c = s[j]
+ else:
+ c = bytes([s[j]])
+ if c == b'#':
+ self.hex = b''
return (self.parse_literal_hex, j+1)
- self.add_token(LIT(self.token))
+ self.add_token(PSLiteralTable.intern(self.token))
return (self.parse_main, j)
def parse_literal_hex(self, s, i):
- c = s[i]
+ if isinstance(s[i], str):
+ c = s[i]
+ else:
+ c = bytes([s[i]])
if HEX.match(c) and len(self.hex) < 2:
self.hex += c
return (self.parse_literal_hex, i+1)
if self.hex:
- self.token += chr(int(self.hex, 16))
+ self.token += bytes([int(self.hex, 16)])
return (self.parse_literal, i)
def parse_number(self, s, i):
return (self.parse_number, len(s))
j = m.start(0)
self.token += s[i:j]
- c = s[j]
- if c == '.':
+ if isinstance(s[j], str):
+ c = s[j]
+ else:
+ c = bytes([s[j]])
+ if c == b'.':
self.token += c
return (self.parse_decimal, j+1)
try:
return (self.parse_decimal, len(s))
j = m.start(0)
self.token += s[i:j]
- self.add_token(Decimal(self.token))
+ self.add_token(Decimal(self.token.decode('utf-8')))
return (self.parse_main, j)
def parse_keyword(self, s, i):
return (self.parse_string, len(s))
j = m.start(0)
self.token += s[i:j]
- c = s[j]
- if c == '\\':
+ if isinstance(s[j], str):
+ c = s[j]
+ else:
+ c = bytes([s[j]])
+ if c == b'\\':
self.oct = ''
return (self.parse_string_1, j+1)
- if c == '(':
+ if c == b'(':
self.paren += 1
self.token += c
return (self.parse_string, j+1)
- if c == ')':
+ if c == b')':
self.paren -= 1
if self.paren:
self.token += c
self.add_token(self.token)
return (self.parse_main, j+1)
def parse_string_1(self, s, i):
- c = s[i]
+ if isinstance(s[i], str):
+ c = s[i]
+ else:
+ c = bytes([s[i]])
if OCT_STRING.match(c) and len(self.oct) < 3:
self.oct += c
return (self.parse_string_1, i+1)
if self.oct:
- self.token += chr(int(self.oct, 8))
+ self.token += bytes([int(self.oct, 8)])
return (self.parse_string, i)
if c in ESC_STRING:
- self.token += chr(ESC_STRING[c])
+ self.token += bytes([ESC_STRING[c]])
return (self.parse_string, i+1)
def parse_wopen(self, s, i):
- c = s[i]
+ if isinstance(s[i], str):
+ c = s[i]
+ else:
+ c = bytes([s[i]])
if c.isspace() or HEX.match(c):
return (self.parse_hexstring, i)
- if c == '<':
+ if c == b'<':
self.add_token(KEYWORD_DICT_BEGIN)
i += 1
return (self.parse_main, i)
def parse_wclose(self, s, i):
- c = s[i]
- if c == '>':
+ if isinstance(s[i], str):
+ c = s[i]
+ else:
+ c = bytes([s[i]])
+ if c == b'>':
self.add_token(KEYWORD_DICT_END)
i += 1
return (self.parse_main, i)
def parse_hexstring(self, s, i):
- m = END_HEX_STRING.search(s, i)
- if not m:
+ m1 = END_HEX_STRING.search(s, i)
+ if not m1:
self.token += s[i:]
return (self.parse_hexstring, len(s))
- j = m.start(0)
+ j = m1.start(0)
self.token += s[i:j]
- token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
- SPC.sub('', self.token))
+ token = HEX_PAIR.sub(lambda m2: bytes([int(m2.group(0), 16)]),
+ SPC.sub(b'', self.token))
self.add_token(token)
return (self.parse_main, j)
'''
Fetches a next line that ends either with \\r or \\n.
'''
- linebuf = ''
+ linebuf = b''
linepos = self.bufpos + self.charpos
eol = False
while 1:
self.fillbuf()
if eol:
- c = self.buf[self.charpos]
+ c = bytes([self.buf[self.charpos]])
# handle '\r\n'
- if c == '\n':
+ if c == b'\n':
linebuf += c
self.charpos += 1
break
if m:
linebuf += self.buf[self.charpos:m.end(0)]
self.charpos = m.end(0)
- if linebuf[-1] == '\r':
+ if bytes([linebuf[-1]]) == b'\r':
eol = True
else:
break
'''
self.fp.seek(0, 2)
pos = self.fp.tell()
- buf = ''
+ buf = b''
while 0 < pos:
prevpos = pos
pos = max(0, pos-self.BUFSIZ)
s = self.fp.read(prevpos-pos)
if not s: break
while 1:
- n = max(s.rfind('\r'), s.rfind('\n'))
+ n = max(s.rfind(b'\r'), s.rfind(b'\n'))
if n == -1:
buf = s + buf
break
yield s[n:]+buf
s = s[:n]
- buf = ''
+ buf = b''
return
def nextobject(self, direct=False):
'''
- Yields a list of objects: keywords, literals, strings,
+ Yields a list of objects: keywords, literals, strings (byte arrays),
numbers, arrays and dictionaries. Arrays and dictionaries
are represented as Python sequence and dictionaries.
'''
if (isinstance(token, int) or
isinstance(token, Decimal) or
isinstance(token, bool) or
+ isinstance(token, bytearray) or
+ isinstance(token, bytes) or
isinstance(token, str) or
isinstance(token, PSLiteral)):
# normal token
return obj
-LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
-LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl'))
-LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW'))
-LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85'))
+LITERAL_CRYPT = PSLiteralTable.intern(b'Crypt')
+LITERALS_FLATE_DECODE = (PSLiteralTable.intern(b'FlateDecode'), PSLiteralTable.intern(b'Fl'))
+LITERALS_LZW_DECODE = (PSLiteralTable.intern(b'LZWDecode'), PSLiteralTable.intern(b'LZW'))
+LITERALS_ASCII85_DECODE = (PSLiteralTable.intern(b'ASCII85Decode'), PSLiteralTable.intern(b'A85'))
## PDF Objects
if isinstance(x, list):
x = [ resolve_all(v) for v in x ]
elif isinstance(x, dict):
- for (k,v) in x.iteritems():
+ for (k,v) in iter(x.items()):
x[k] = resolve_all(v)
return x
'''
Recursively decipher X.
'''
- if isinstance(x, str):
+ if isinstance(x, bytearray) or isinstance(x,bytes) or isinstance(x,str):
return decipher(objid, genno, x)
decf = lambda v: decipher_all(decipher, objid, genno, v)
if isinstance(x, list):
x = [decf(v) for v in x]
elif isinstance(x, dict):
- x = dict((k, decf(v)) for (k, v) in x.iteritems())
+ x = dict((k, decf(v)) for (k, v) in iter(x.items()))
return x
def str_value(x):
x = resolve1(x)
- if not isinstance(x, str):
+ if not (isinstance(x, bytearray) or isinstance(x, bytes) or isinstance(x, str)):
if STRICT:
raise PDFTypeError('String required: %r' % x)
return ''
# ascii85decode(data)
def ascii85decode(data):
n = b = 0
- out = ''
+ out = b''
for c in data:
- if '!' <= c and c <= 'u':
+ if b'!' <= c and c <= b'u':
n += 1
- b = b*85+(ord(c)-33)
+ b = b*85+(c-33)
if n == 5:
out += struct.pack('>L',b)
n = b = 0
- elif c == 'z':
+ elif c == b'z':
assert n == 0
- out += '\0\0\0\0'
- elif c == '~':
+ out += b'\0\0\0\0'
+ elif c == b'~':
if n:
for _ in range(5-n):
b = b*85+84
cutdiv = len(rawdata) // 16
rawdata = rawdata[:16*cutdiv]
else:
- if eol in ('\r', '\n', '\r\n'):
+ if eol in (b'\r', b'\n', b'\r\n'):
rawdata = rawdata[:length]
self.dic = dic
# will get errors if the document is encrypted.
data = zlib.decompress(data)
elif f in LITERALS_LZW_DECODE:
- data = ''.join(LZWDecoder(BytesIO(data)).run())
+ data = b''.join(LZWDecoder(BytesIO(data)).run())
elif f in LITERALS_ASCII85_DECODE:
data = ascii85decode(data)
elif f == LITERAL_CRYPT:
raise PDFValueError(
'Columns undefined for predictor=12')
columns = int_value(params['Columns'])
- buf = ''
- ent0 = '\x00' * columns
+ buf = b''
+ ent0 = b'\x00' * columns
for i in range(0, len(data), columns+1):
pred = data[i]
ent1 = data[i+1:i+1+columns]
- if pred == '\x02':
- ent1 = ''.join(chr((ord(a)+ord(b)) & 255) \
- for (a,b) in zip(ent0,ent1))
+ if pred == 2:
+ ent1 = b''.join(bytes([(a+b) & 255]) \
+ for (a,b) in zip(ent0,ent1))
buf += ent1
ent0 = ent1
data = buf
class PDFPasswordIncorrect(PDFEncryptionError): pass
# some predefined literals and keywords.
-LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
-LITERAL_XREF = PSLiteralTable.intern('XRef')
-LITERAL_PAGE = PSLiteralTable.intern('Page')
-LITERAL_PAGES = PSLiteralTable.intern('Pages')
-LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
+LITERAL_OBJSTM = PSLiteralTable.intern(b'ObjStm')
+LITERAL_XREF = PSLiteralTable.intern(b'XRef')
+LITERAL_PAGE = PSLiteralTable.intern(b'Page')
+LITERAL_PAGES = PSLiteralTable.intern(b'Pages')
+LITERAL_CATALOG = PSLiteralTable.intern(b'Catalog')
## XRefs
return '<PDFXRef: objs=%d>' % len(self.offsets)
def objids(self):
- return self.offsets.iterkeys()
+ return iter(self.offsets.keys())
def load(self, parser):
self.offsets = {}
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line:
raise PDFNoValidXRef('Premature eof: %r' % parser)
- if line.startswith('trailer'):
+ if line.startswith(b'trailer'):
parser.seek(pos)
break
- f = line.strip().split(' ')
+ f = line.strip().split(b' ')
if len(f) != 2:
raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
try:
(_, line) = parser.nextline()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
- f = line.strip().split(' ')
+ f = line.strip().split(b' ')
if len(f) != 3:
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f
- if use != 'n': continue
- self.offsets[objid] = (int(genno), int(pos))
+ if use != b'n':
+ continue
+ self.offsets[objid] = (int(genno.decode('utf-8')), int(pos.decode('utf-8')))
self.load_trailer(parser)
return
- KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
+ KEYWORD_TRAILER = PSKeywordTable.intern(b'trailer')
def load_trailer(self, parser):
try:
(_,kwd) = parser.nexttoken()
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream.dic['Size']
index = stream.dic.get('Index', (0,size))
- self.index = zip(islice(index, 0, None, 2),
- islice(index, 1, None, 2))
+ self.index = list(zip(itertools.islice(index, 0, None, 2),
+ itertools.islice(index, 1, None, 2)))
(self.fl1, self.fl2, self.fl3) = stream.dic['W']
self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3
# set_parser(parser)
# Associates the document with an (already initialized) parser object.
def set_parser(self, parser):
- if self.parser: return
+ if self.parser:
+ return
self.parser = parser
# The document is set to be temporarily ready during collecting
# all the basic information about the document, e.g.
dict_value(trailer['Encrypt']))
# fix for bad files
except:
- self.encryption = ('ffffffffffffffffffffffffffffffffffff',
+ self.encryption = (b'ffffffffffffffffffffffffffffffffffff',
dict_value(trailer['Encrypt']))
if 'Root' in trailer:
self.set_root(dict_value(trailer['Root']))
break
- else:
- raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
+ else:
+ raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
# The document is set to be non-ready again, until all the
# proper initialization (asking the password key and
# verifying the access permission, so on) is finished.
# Perform the initialization with a given password.
# This step is mandatory even if there's no password associated
# with the document.
- def initialize(self, password=''):
+ def initialize(self, password=b''):
if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True
self.ready = True
def genkey_adobe_ps(self, param):
# nice little offline principal keys dictionary
# global static principal key for German Onleihe / Bibliothek Digital
- principalkeys = { 'bibliothek-digital.de': 'rRwGv2tbpKov1krvv7PO0ws9S436/lArPlfipz5Pqhw='.decode('base64')}
+ principalkeys = { b'bibliothek-digital.de': codecs.decode(b'rRwGv2tbpKov1krvv7PO0ws9S436/lArPlfipz5Pqhw=','base64')}
self.is_printable = self.is_modifiable = self.is_extractable = True
- length = int_value(param.get('Length', 0)) / 8
+ length = int_value(param.get('Length', 0)) // 8
edcdata = str_value(param.get('EDCData')).decode('base64')
pdrllic = str_value(param.get('PDRLLic')).decode('base64')
pdrlpol = str_value(param.get('PDRLPol')).decode('base64')
edclist = []
- for pair in edcdata.split('\n'):
+ for pair in edcdata.split(b'\n'):
edclist.append(pair)
# principal key request
for key in principalkeys:
else:
raise IGNOBLEError('Cannot find principal key for this pdf')
shakey = SHA256(principalkey)
- ivector = 16 * chr(0)
+ ivector = bytes(16)
plaintext = AES.new(shakey,AES.MODE_CBC,ivector).decrypt(edclist[9].decode('base64'))
- if plaintext[-16:] != 16 * chr(16):
+ if plaintext[-16:] != bytearray(b'\0x10')*16:
raise IGNOBLEError('Offlinekey cannot be decrypted, aborting ...')
pdrlpol = AES.new(plaintext[16:32],AES.MODE_CBC,edclist[2].decode('base64')).decrypt(pdrlpol)
- if ord(pdrlpol[-1]) < 1 or ord(pdrlpol[-1]) > 16:
+ if pdrlpol[-1] < 1 or pdrlpol[-1] > 16:
raise IGNOBLEError('Could not decrypt PDRLPol, aborting ...')
else:
- cutter = -1 * ord(pdrlpol[-1])
+ cutter = -1 * pdrlpol[-1]
pdrlpol = pdrlpol[:cutter]
return plaintext[:16]
- PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..' \
- '\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
+ PASSWORD_PADDING = b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..' \
+ b'\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
# experimental aes pw support
def initialize_standard(self, password, docid, param):
# copy from a global variable
try:
EncMetadata = str_value(param['EncryptMetadata'])
except:
- EncMetadata = 'True'
+ EncMetadata = b'True'
self.is_printable = bool(P & 4)
self.is_modifiable = bool(P & 8)
self.is_extractable = bool(P & 16)
hash.update(docid[0]) # 5
# aes special handling if metadata isn't encrypted
if EncMetadata == ('False' or 'false'):
- hash.update('ffffffff'.decode('hex'))
+ hash.update(codecs.decode(b'ffffffff','hex'))
if 5 <= R:
# 8
for _ in range(50):
- hash = hashlib.md5(hash.digest()[:length/8])
- key = hash.digest()[:length/8]
+ hash = hashlib.md5(hash.digest()[:length//8])
+ key = hash.digest()[:length//8]
if R == 2:
# Algorithm 3.4
u1 = ARC4.new(key).decrypt(password)
hash.update(docid[0]) # 3
x = ARC4.new(key).decrypt(hash.digest()[:16]) # 4
for i in range(1,19+1):
- k = ''.join( chr(ord(c) ^ i) for c in key )
+ k = b''.join(bytes([c ^ i]) for c in key )
x = ARC4.new(k).decrypt(x)
u1 = x+x # 32bytes total
if R == 2:
if V != 4:
self.decipher = self.decipher_rc4 # XXX may be AES
# aes
- elif V == 4 and Length == 128:
+ elif V == 4 and length == 128:
elf.decipher = self.decipher_aes
- elif V == 4 and Length == 256:
+ elif V == 4 and length == 256:
raise PDFNotImplementedError('AES256 encryption is currently unsupported')
self.ready = True
return
bookkey = ''.join(rights.findtext(expr)).decode('base64')
bookkey = aes.decrypt(bookkey)
bookkey = bookkey[:-ord(bookkey[-1])]
+ # todo: Take a look at this.
+ # This seems to be the only function that's different between ignoblepdf and ineptpdf.
+ # A ton of useless duplicated code .....
bookkey = bookkey[-16:]
ebx_V = int_value(param.get('V', 4))
ebx_type = int_value(param.get('EBX_ENCRYPTIONTYPE', 6))
else:
V = 2
elif len(bookkey) == length + 1:
- V = ord(bookkey[0])
+ V = bookkey[0]
bookkey = bookkey[1:]
else:
print("ebx_V is %d and ebx_type is %d" % (ebx_V, ebx_type))
print("length is %d and len(bookkey) is %d" % (length, len(bookkey)))
- print("bookkey[0] is %d" % ord(bookkey[0]))
+ print("bookkey[0] is %d" % bookkey[0])
raise IGNOBLEError('error decrypting book session key - mismatched length')
else:
# proper length unknown try with whatever you have
objid = struct.pack('<L', objid ^ 0x3569ac)
genno = struct.pack('<L', genno ^ 0xca96)
key = self.decrypt_key
- key += objid[0] + genno[0] + objid[1] + genno[1] + objid[2] + 'sAlT'
+ key += objid[0] + genno[0] + objid[1] + genno[1] + objid[2] + b'sAlT'
hash = hashlib.md5(key)
key = hash.digest()[:min(len(self.decrypt_key) + 5, 16)]
return key
def genkey_v4(self, objid, genno):
objid = struct.pack('<L', objid)[:3]
genno = struct.pack('<L', genno)[:2]
- key = self.decrypt_key + objid + genno + 'sAlT'
+ key = self.decrypt_key + objid + genno + b'sAlT'
hash = hashlib.md5(key)
key = hash.digest()[:min(len(self.decrypt_key) + 5, 16)]
return key
data = data[16:]
plaintext = AES.new(key,AES.MODE_CBC,ivector).decrypt(data)
# remove pkcs#5 aes padding
- cutter = -1 * ord(plaintext[-1])
- #print(cutter)
+ cutter = -1 * plaintext[-1]
plaintext = plaintext[:cutter]
return plaintext
data = data[16:]
plaintext = AES.new(key,AES.MODE_CBC,ivector).decrypt(data)
# remove pkcs#5 aes padding
- cutter = -1 * ord(plaintext[-1])
- #print(cutter)
+ cutter = -1 * plaintext[-1]
plaintext = plaintext[:cutter]
return plaintext
return ARC4.new(key).decrypt(data)
- KEYWORD_OBJ = PSKeywordTable.intern('obj')
+ KEYWORD_OBJ = PSKeywordTable.intern(b'obj')
def getobj(self, objid):
if not self.ready:
def __repr__(self):
return '<PDFParser>'
- KEYWORD_R = PSKeywordTable.intern('R')
- KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
- KEYWORD_STREAM = PSKeywordTable.intern('stream')
- KEYWORD_XREF = PSKeywordTable.intern('xref')
- KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
+ KEYWORD_R = PSKeywordTable.intern(b'R')
+ KEYWORD_ENDOBJ = PSKeywordTable.intern(b'endobj')
+ KEYWORD_STREAM = PSKeywordTable.intern(b'stream')
+ KEYWORD_XREF = PSKeywordTable.intern(b'xref')
+ KEYWORD_STARTXREF = PSKeywordTable.intern(b'startxref')
def do_keyword(self, pos, token):
if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
self.add_results(*self.pop(1))
if STRICT:
raise PDFSyntaxError('Unexpected EOF')
break
- if 'endstream' in line:
- i = line.index('endstream')
+ if b'endstream' in line:
+ i = line.index(b'endstream')
objlen += i
data += line[:i]
break
prev = None
for line in self.revreadlines():
line = line.strip()
- if line == 'startxref': break
+ if line == b'startxref': break
if line:
prev = line
else:
except PDFNoValidXRef:
# fallback
self.seek(0)
- pat = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
+ pat = re.compile(b'^(\\d+)\\s+(\\d+)\\s+obj\\b')
offsets = {}
xref = PDFXRef()
while 1:
(pos, line) = self.nextline()
except PSEOF:
break
- if line.startswith('trailer'):
+ if line.startswith(b'trailer'):
trailerpos = pos # remember last trailer
m = pat.match(line)
if not m: continue
self.add_results(*self.popall())
return
- KEYWORD_R = KWD('R')
+ KEYWORD_R = KWD(b'R')
def do_keyword(self, pos, token):
if token is self.KEYWORD_R:
# reference to indirect object
def dump(self, outf):
self.outf = outf
self.write(self.version)
- self.write('\n%\xe2\xe3\xcf\xd3\n')
+ self.write(b'\n%\xe2\xe3\xcf\xd3\n')
doc = self.doc
objids = self.objids
xrefs = {}
startxref = self.tell()
if not gen_xref_stm:
- self.write('xref\n')
- self.write('0 %d\n' % (maxobj + 1,))
+ self.write(b'xref\n')
+ self.write(b'0 %d\n' % (maxobj + 1,))
for objid in range(0, maxobj + 1):
if objid in xrefs:
# force the genno to be 0
- self.write("%010d 00000 n \n" % xrefs[objid][0])
+ self.write(b"%010d 00000 n \n" % xrefs[objid][0])
else:
- self.write("%010d %05d f \n" % (0, 65535))
+ self.write(b"%010d %05d f \n" % (0, 65535))
- self.write('trailer\n')
+ self.write(b'trailer\n')
self.serialize_object(trailer)
- self.write('\nstartxref\n%d\n%%%%EOF' % startxref)
+ self.write(b'\nstartxref\n%d\n%%%%EOF' % startxref)
else: # Generate crossref stream.
data.append(struct.pack('>L', f2)[-fl2:])
data.append(struct.pack('>L', f3)[-fl3:])
index.extend((first, prev - first + 1))
- data = zlib.compress(''.join(data))
+ data = zlib.compress(b''.join(data))
dic = {'Type': LITERAL_XREF, 'Size': prev + 1, 'Index': index,
'W': [1, fl2, fl3], 'Length': len(data),
'Filter': LITERALS_FLATE_DECODE[0],
dic['Info'] = trailer['Info']
xrefstm = PDFStream(dic, data)
self.serialize_indirect(maxobj, xrefstm)
- self.write('startxref\n%d\n%%%%EOF' % startxref)
+ self.write(b'startxref\n%d\n%%%%EOF' % startxref)
def write(self, data):
self.outf.write(data)
self.last = data[-1:]
return self.outf.tell()
def escape_string(self, string):
- string = string.replace('\\', '\\\\')
- string = string.replace('\n', r'\n')
- string = string.replace('(', r'\(')
- string = string.replace(')', r'\)')
- # get rid of ciando id
- regularexp = re.compile(r'http://www.ciando.com/index.cfm/intRefererID/\d{5}')
- if regularexp.match(string): return ('http://www.ciando.com')
+ string = string.replace(b'\\', b'\\\\')
+ string = string.replace(b'\n', b'\\n')
+ string = string.replace(b'(', b'\\(')
+ string = string.replace(b')', b'\\)')
return string
def serialize_object(self, obj):
obj['Subtype'] = obj['Type']
del obj['Type']
# end - hope this doesn't have bad effects
- self.write('<<')
+ self.write(b'<<')
for key, val in obj.items():
- self.write('/%s' % key)
+ self.write(str(PSLiteralTable.intern(key.encode('utf-8'))).encode('utf-8'))
self.serialize_object(val)
- self.write('>>')
+ self.write(b'>>')
elif isinstance(obj, list):
- self.write('[')
+ self.write(b'[')
for val in obj:
self.serialize_object(val)
- self.write(']')
+ self.write(b']')
+ elif isinstance(obj, bytearray):
+ self.write(b'(%s)' % self.escape_string(obj))
+ elif isinstance(obj, bytes):
+ self.write(b'(%s)' % self.escape_string(obj))
elif isinstance(obj, str):
- self.write('(%s)' % self.escape_string(obj))
+ self.write(b'(%s)' % self.escape_string(obj.encode('utf-8')))
elif isinstance(obj, bool):
if self.last.isalnum():
- self.write(' ')
- self.write(str(obj).lower())
+ self.write(b' ')
+ self.write(str(obj).lower().encode('utf-8'))
elif isinstance(obj, (int, long)):
if self.last.isalnum():
- self.write(' ')
- self.write(str(obj))
+ self.write(b' ')
+ self.write(str(obj).encode('utf-8'))
elif isinstance(obj, Decimal):
if self.last.isalnum():
- self.write(' ')
- self.write(str(obj))
+ self.write(b' ')
+ self.write(str(obj).encode('utf-8'))
elif isinstance(obj, PDFObjRef):
if self.last.isalnum():
- self.write(' ')
- self.write('%d %d R' % (obj.objid, 0))
+ self.write(b' ')
+ self.write(b'%d %d R' % (obj.objid, 0))
elif isinstance(obj, PDFStream):
### If we don't generate cross ref streams the object streams
### are no longer useful, as we have extracted all objects from
else:
data = obj.get_decdata()
self.serialize_object(obj.dic)
- self.write('stream\n')
+ self.write(b'stream\n')
self.write(data)
- self.write('\nendstream')
+ self.write(b'\nendstream')
else:
- data = str(obj)
- if data[0].isalnum() and self.last.isalnum():
- self.write(' ')
+ data = str(obj).encode('utf-8')
+ if bytes([data[0]]).isalnum() and self.last.isalnum():
+ self.write(b' ')
self.write(data)
def serialize_indirect(self, objid, obj):
- self.write('%d 0 obj' % (objid,))
+ self.write(b'%d 0 obj' % (objid,))
self.serialize_object(obj)
if self.last.isalnum():
- self.write('\n')
- self.write('endobj\n')
+ self.write(b'\n')
+ self.write(b'endobj\n')
if AES is None:
raise IGNOBLEError("PyCrypto or OpenSSL must be installed.")
with open(inpath, 'rb') as inf:
- #try:
serializer = PDFSerializer(inf, userkey)
- #except:
- # print("Error serializing pdf {0}. Probably wrong key.".format(os.path.basename(inpath)))
- # return 2
- # hope this will fix the 'bad file descriptor' problem
with open(outpath, 'wb') as outf:
# help construct to make sure the method runs to the end
try:
if self.encoding == None:
self.encoding = "utf-8"
def write(self, data):
- if isinstance(data, str):
+ if isinstance(data,str) or isinstance(data,unicode):
+ # str for Python3, unicode for Python2
data = data.encode(self.encoding,"replace")
- self.stream.buffer.write(data)
- self.stream.buffer.flush()
-
+ try:
+ buffer = getattr(self.stream, 'buffer', self.stream)
+ # self.stream.buffer for Python3, self.stream for Python2
+ buffer.write(data)
+ buffer.flush()
+ except:
+ # We can do nothing if a write fails
+ raise
def __getattr__(self, attr):
return getattr(self.stream, attr)
return ["ineptpdf.py"]
else:
argvencoding = sys.stdin.encoding or "utf-8"
- return [arg if isinstance(arg, str) else str(arg, argvencoding) for arg in sys.argv]
+ return [arg if (isinstance(arg, str) or isinstance(arg,unicode)) else str(arg, argvencoding) for arg in sys.argv]
class ADEPTError(Exception):
## PSBaseParser
##
-EOL = re.compile(rb'[\r\n]')
-SPC = re.compile(rb'\s')
-NONSPC = re.compile(rb'\S')
-HEX = re.compile(rb'[0-9a-fA-F]')
-END_LITERAL = re.compile(rb'[#/%\[\]()<>{}\s]')
-END_HEX_STRING = re.compile(rb'[^\s0-9a-fA-F]')
-HEX_PAIR = re.compile(rb'[0-9a-fA-F]{2}|.')
-END_NUMBER = re.compile(rb'[^0-9]')
-END_KEYWORD = re.compile(rb'[#/%\[\]()<>{}\s]')
-END_STRING = re.compile(rb'[()\\]')
-OCT_STRING = re.compile(rb'[0-7]')
+EOL = re.compile(br'[\r\n]')
+SPC = re.compile(br'\s')
+NONSPC = re.compile(br'\S')
+HEX = re.compile(br'[0-9a-fA-F]')
+END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]')
+END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]')
+HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.')
+END_NUMBER = re.compile(br'[^0-9]')
+END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
+END_STRING = re.compile(br'[()\\]')
+OCT_STRING = re.compile(br'[0-7]')
ESC_STRING = { b'b':8, b't':9, b'n':10, b'f':12, b'r':13, b'(':40, b')':41, b'\\':92 }
class PSBaseParser(object):
if not m:
return (self.parse_main, len(s))
j = m.start(0)
- c = bytes([s[j]])
+ if isinstance(s[j], str):
+ # Python 2
+ c = s[j]
+ else:
+ # Python 3
+ c = bytes([s[j]])
self.tokenstart = self.bufpos+j
if c == b'%':
self.token = c
return (self.parse_literal, len(s))
j = m.start(0)
self.token += s[i:j]
- c = bytes([s[j]])
+ if isinstance(s[j], str):
+ c = s[j]
+ else:
+ c = bytes([s[j]])
if c == b'#':
self.hex = b''
return (self.parse_literal_hex, j+1)
return (self.parse_main, j)
def parse_literal_hex(self, s, i):
- c = bytes([s[i]])
+ if isinstance(s[i], str):
+ c = s[i]
+ else:
+ c = bytes([s[i]])
if HEX.match(c) and len(self.hex) < 2:
self.hex += c
return (self.parse_literal_hex, i+1)
return (self.parse_number, len(s))
j = m.start(0)
self.token += s[i:j]
- c = bytes([s[j]])
+ if isinstance(s[j], str):
+ c = s[j]
+ else:
+ c = bytes([s[j]])
if c == b'.':
self.token += c
return (self.parse_decimal, j+1)
return (self.parse_string, len(s))
j = m.start(0)
self.token += s[i:j]
- c = bytes([s[j]])
+ if isinstance(s[j], str):
+ c = s[j]
+ else:
+ c = bytes([s[j]])
if c == b'\\':
self.oct = ''
return (self.parse_string_1, j+1)
return (self.parse_main, j+1)
def parse_string_1(self, s, i):
- c = bytes([s[i]])
+ if isinstance(s[i], str):
+ c = s[i]
+ else:
+ c = bytes([s[i]])
if OCT_STRING.match(c) and len(self.oct) < 3:
self.oct += c
return (self.parse_string_1, i+1)
return (self.parse_string, i+1)
def parse_wopen(self, s, i):
- c = bytes([s[i]])
+ if isinstance(s[i], str):
+ c = s[i]
+ else:
+ c = bytes([s[i]])
if c.isspace() or HEX.match(c):
return (self.parse_hexstring, i)
if c == b'<':
return (self.parse_main, i)
def parse_wclose(self, s, i):
- c = bytes([s[i]])
+ if isinstance(s[i], str):
+ c = s[i]
+ else:
+ c = bytes([s[i]])
if c == b'>':
self.add_token(KEYWORD_DICT_END)
i += 1
isinstance(token, bool) or
isinstance(token, bytearray) or
isinstance(token, bytes) or
+ isinstance(token, str) or
isinstance(token, PSLiteral)):
# normal token
self.push((pos, token))
'''
Recursively decipher X.
'''
- if isinstance(x, bytearray) or isinstance(x,bytes):
+ if isinstance(x, bytearray) or isinstance(x,bytes) or isinstance(x,str):
return decipher(objid, genno, x)
decf = lambda v: decipher_all(decipher, objid, genno, v)
if isinstance(x, list):
def str_value(x):
x = resolve1(x)
- if not (isinstance(x, bytearray) or isinstance(x, bytes)):
+ if not (isinstance(x, bytearray) or isinstance(x, bytes) or isinstance(x, str)):
if STRICT:
raise PDFTypeError('String required: %r' % x)
return ''
for xref in self.xrefs:
trailer = xref.trailer
if not trailer: continue
-
# If there's an encryption info, remember it.
if 'Encrypt' in trailer:
#assert not self.encryption
except PDFNoValidXRef:
# fallback
self.seek(0)
- pat = re.compile(rb'^(\d+)\s+(\d+)\s+obj\b')
+ pat = re.compile(b'^(\\d+)\\s+(\\d+)\\s+obj\\b')
offsets = {}
xref = PDFXRef()
while 1:
def escape_string(self, string):
string = string.replace(b'\\', b'\\\\')
- string = string.replace(b'\n', rb'\n')
- string = string.replace(b'(', rb'\(')
- string = string.replace(b')', rb'\)')
+ string = string.replace(b'\n', b'\\n')
+ string = string.replace(b'(', b'\\(')
+ string = string.replace(b')', b'\\)')
return string
def serialize_object(self, obj):