Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2# 

3# vim: sw=4:expandtab:foldmethod=marker 

4# 

5# Copyright (c) 2006, Mathieu Fenniak 

6# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> 

7# 

8# All rights reserved. 

9# 

10# Redistribution and use in source and binary forms, with or without 

11# modification, are permitted provided that the following conditions are 

12# met: 

13# 

14# * Redistributions of source code must retain the above copyright notice, 

15# this list of conditions and the following disclaimer. 

16# * Redistributions in binary form must reproduce the above copyright notice, 

17# this list of conditions and the following disclaimer in the documentation 

18# and/or other materials provided with the distribution. 

19# * The name of the author may not be used to endorse or promote products 

20# derived from this software without specific prior written permission. 

21# 

22# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

23# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

24# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

25# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 

26# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 

27# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 

28# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 

29# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 

30# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 

31# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 

32# POSSIBILITY OF SUCH DAMAGE. 

33 

34""" 

35A pure-Python PDF library with an increasing number of capabilities. 

36See README for links to FAQ, documentation, homepage, etc. 

37""" 

38 

# Package authorship metadata.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

__maintainer__ = "Phaseit, Inc."
__maintainer_email__ = "PyPDF2@phaseit.net"
# The name was historically spelled without the trailing underscores; keep
# that spelling as an alias so anything reading the old name still works.
__maintainer_email = __maintainer_email__

44 

45import string 

46import math 

47import struct 

48import sys 

49import uuid 

50from sys import version_info 

51if version_info < ( 3, 0 ): 

52 from cStringIO import StringIO 

53else: 

54 from io import StringIO 

55 

56if version_info < ( 3, 0 ): 

57 BytesIO = StringIO 

58else: 

59 from io import BytesIO 

60 

61from . import filters 

62from . import utils 

63import warnings 

64import codecs 

65from .generic import * 

66from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList 

67from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning 

68 

69if version_info < ( 2, 4 ): 

70 from sets import ImmutableSet as frozenset 

71 

72if version_info < ( 2, 5 ): 

73 from md5 import md5 

74else: 

75 from hashlib import md5 

76import uuid 

77 

78 

79class PdfFileWriter(object): 

80 """ 

81 This class supports writing PDF files out, given pages produced by another 

82 class (typically :class:`PdfFileReader<PdfFileReader>`). 

83 """ 

def __init__(self):
    """
    Create an empty writer.

    Sets the file header, the list of indirect objects, the page tree
    root, the document information dictionary and the catalog dictionary.
    """
    self._header = b_("%PDF-1.3")
    # Indirect objects in id order: list position i holds object number i + 1.
    self._objects = []

    # Page tree root; starts with an empty /Kids array and a zero /Count.
    page_tree = DictionaryObject()
    page_tree.update({
        NameObject("/Type"): NameObject("/Pages"),
        NameObject("/Count"): NumberObject(0),
        NameObject("/Kids"): ArrayObject(),
    })
    self._pages = self._addObject(page_tree)

    # Document information dictionary. The producer string is UTF-16BE
    # encoded and prefixed with the matching byte order mark.
    producer = codecs.BOM_UTF16_BE + u_("PyPDF2").encode('utf-16be')
    doc_info = DictionaryObject()
    doc_info.update({
        NameObject("/Producer"): createStringObject(producer)
    })
    self._info = self._addObject(doc_info)

    # Catalog. It is only kept as a plain dictionary here; self._root stays
    # None until the catalog is registered as an indirect object.
    catalog = DictionaryObject()
    catalog.update({
        NameObject("/Type"): NameObject("/Catalog"),
        NameObject("/Pages"): self._pages,
    })
    self._root_object = catalog
    self._root = None

112 

def _addObject(self, obj):
    """
    Append ``obj`` to the writer's object list.

    :return: an IndirectObject (generation 0, owned by this writer) whose
        id number is the 1-based position of ``obj`` in the list.
    """
    registry = self._objects
    registry.append(obj)
    new_idnum = len(registry)
    return IndirectObject(new_idnum, 0, self)

116 

def getObject(self, ido):
    """
    Resolve an indirect reference that belongs to this writer.

    :param ido: reference exposing ``pdf`` and ``idnum`` attributes.
    :return: the object stored under ``ido.idnum`` (id numbers are 1-based).
    :raises ValueError: if ``ido.pdf`` is not this writer.
    """
    owned_elsewhere = ido.pdf != self
    if owned_elsewhere:
        raise ValueError("pdf must be self")
    position = ido.idnum - 1
    return self._objects[position]

121 

def _addPage(self, page, action):
    """
    Register ``page`` and hook its reference into the page tree.

    :param page: page dictionary; its ``/Type`` must equal ``/Page``.
    :param action: callable invoked as ``action(kids, page_reference)``; it
        decides where the new reference goes in the ``/Kids`` array.
    """
    assert page["/Type"] == "/Page"
    page[NameObject("/Parent")] = self._pages
    page_ref = self._addObject(page)

    tree_root = self.getObject(self._pages)
    kids = tree_root["/Kids"]
    action(kids, page_ref)

    # Keep the page count in step with the /Kids array.
    new_count = tree_root["/Count"] + 1
    tree_root[NameObject("/Count")] = NumberObject(new_count)

129 

def addPage(self, page):
    """
    Append a page to the end of this PDF file. The page is usually
    acquired from a :class:`PdfFileReader<PdfFileReader>` instance.

    :param PageObject page: The page to add to the document. Should be
        an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
    """
    # list.append places the new page reference last in /Kids.
    place_last = list.append
    self._addPage(page, place_last)

139 

def insertPage(self, page, index=0):
    """
    Insert a page at a given position in this PDF file. The page is
    usually acquired from a :class:`PdfFileReader<PdfFileReader>` instance.

    :param PageObject page: The page to add to the document. This
        argument should be an instance of :class:`PageObject<pdf.PageObject>`.
    :param int index: Position at which the page will be inserted.
    """
    def place_at_index(kids, page_ref):
        # list.insert semantics: the reference lands before position ``index``.
        return kids.insert(index, page_ref)

    self._addPage(page, place_at_index)

150 

def getPage(self, pageNumber):
    """
    Retrieves a page by number from this PDF file.

    :param int pageNumber: The page number to retrieve
        (pages begin at zero)
    :return: the page at the index given by *pageNumber*
    :rtype: :class:`PageObject<pdf.PageObject>`
    """
    tree_root = self.getObject(self._pages)
    # XXX: crude hack - only the root node's /Kids array is consulted.
    kids = tree_root["/Kids"]
    page_ref = kids[pageNumber]
    return page_ref.getObject()

163 

def getNumPages(self):
    """
    :return: the number of pages, read from the page tree root's
        ``/Count`` entry.
    :rtype: int
    """
    tree_root = self.getObject(self._pages)
    count_key = NameObject("/Count")
    return int(tree_root[count_key])

171 

def addBlankPage(self, width=None, height=None):
    """
    Appends a blank page to this PDF file and returns it. If no page size
    is specified, use the size of the last page.

    The page itself is built by ``PageObject.createBlankPage``, which
    receives this writer together with the given dimensions.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :return: the newly appended page
    :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist.
    """
    blank = PageObject.createBlankPage(self, width, height)
    self.addPage(blank)
    return blank

189 

def insertBlankPage(self, width=None, height=None, index=0):
    """
    Inserts a blank page to this PDF file and returns it.

    If either dimension is omitted and a page already exists at ``index``,
    both dimensions are taken from that page's media box. Otherwise the
    given values are handed unchanged to ``PageObject.createBlankPage``.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :param int index: Position to add the page.
    :return: the newly inserted page
    :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist (expected to come from
        ``createBlankPage``, which is not defined in this method).
    """
    size_missing = width is None or height is None
    if size_missing:
        # ``or`` binds looser than ``and``, so the size test has to be
        # evaluated as a unit before the bounds test. Without that, a missing
        # width skipped the bounds test and getPage() could be asked for a
        # page that does not exist.
        num_pages = self.getNumPages()
        if -num_pages <= index < num_pages:
            oldpage = self.getPage(index)
            width = oldpage.mediaBox.getWidth()
            height = oldpage.mediaBox.getHeight()
    page = PageObject.createBlankPage(self, width, height)
    self.insertPage(page, index)
    return page

213 

def addJS(self, javascript):
    """
    Add Javascript which will launch upon opening this PDF.

    :param str javascript: Your Javascript.

    >>> output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.
    """
    # The action that runs the script; the source text is wrapped in
    # parentheses and stored under /JS.
    open_action = DictionaryObject()
    open_action.update({
        NameObject("/Type"): NameObject("/Action"),
        NameObject("/S"): NameObject("/JavaScript"),
        NameObject("/JS"): NameObject("(%s)" % javascript)
    })
    action_ref = self._addObject(open_action)

    # The name tree entry needs some key for the script; any string will do,
    # so a random UUID is used.
    script_key = createStringObject(str(uuid.uuid4()))
    javascript_names = DictionaryObject({
        NameObject("/Names"): ArrayObject([script_key, action_ref])
    })
    name_tree = DictionaryObject()
    name_tree.update({
        NameObject("/JavaScript"): javascript_names
    })
    self._addObject(name_tree)

    # NOTE: this sets the catalog's /OpenAction and /Names entries outright,
    # replacing whatever was stored under those keys before.
    self._root_object.update({
        NameObject("/OpenAction"): action_ref,
        NameObject("/Names"): name_tree
    })

246 

def addAttachment(self, fname, fdata):
    """
    Embed a file inside the PDF.

    :param str fname: The filename to display.
    :param str fdata: The data in the file.

    Reference:
    https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    Section 7.11.3
    """
    # Three pieces are built here:
    #   * the stream holding the file's data
    #   * the /Filespec dictionary pointing at that stream
    #   * the name entry that goes into the catalog

    # 1. The embedded file stream. Sample of the intended output:
    #      8 0 obj
    #      <<
    #       /Length 12
    #       /Type /EmbeddedFile
    #      >>
    #      stream
    #      Hello world!
    #      endstream
    #      endobj
    embedded_stream = DecodedStreamObject()
    embedded_stream.setData(fdata)
    embedded_stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile")
    })

    # 2. The file specification. Sample of the intended output:
    #      7 0 obj
    #      <<
    #       /Type /Filespec
    #       /F (hello.txt)
    #       /EF << /F 8 0 R >>
    #      >>
    ef_dict = DictionaryObject()
    ef_dict.update({NameObject("/F"): embedded_stream})

    file_spec = DictionaryObject()
    file_spec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),  # Perhaps also try TextStringObject
        NameObject("/EF"): ef_dict
    })

    # 3. The catalog entry referring to the file specification. Sample of
    #    the intended output:
    #      1 0 obj
    #      <<
    #       /Type /Catalog
    #       /Outlines 2 0 R
    #       /Pages 3 0 R
    #       /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
    #      >>
    #      endobj
    name_pairs = ArrayObject([createStringObject(fname), file_spec])
    embedded_files = DictionaryObject()
    embedded_files.update({
        NameObject("/Names"): name_pairs
    })

    names = DictionaryObject()
    names.update({
        NameObject("/EmbeddedFiles"): embedded_files
    })

    # NOTE: this sets the catalog's /Names entry outright, replacing
    # whatever was stored under that key before.
    self._root_object.update({
        NameObject("/Names"): names
    })

327 

def appendPagesFromReader(self, reader, after_page_append=None):
    """
    Copy every page of ``reader`` to the end of this writer, optionally
    calling a hook after each page has been appended.

    :param reader: a PdfFileReader object whose pages are copied to this
        writer object.
    :callback after_page_append (function): Callback function that is invoked
        after each page is appended to the writer. It is skipped when the
        argument is not callable. Callback signature:

        :param writer_pageref (PDF page reference): The page as retrieved
            back from the writer after it was appended.
    """
    reader_total = reader.getNumPages()
    # Index the first copied page will occupy in this writer.
    first_new_index = self.getNumPages()

    for offset in range(reader_total):
        self.addPage(reader.getPage(offset))
        appended = self.getPage(first_new_index + offset)
        if callable(after_page_append):
            after_page_append(appended)

353 

def updatePageFormFieldValues(self, page, fields):
    """
    Update the form field values for a given page from a fields dictionary.
    Copy field texts and values from fields to page.

    Every annotation on the page whose ``/T`` entry equals a key of
    ``fields`` gets its ``/V`` entry set to the matching value. A page with
    no ``/Annots`` entry is left untouched.

    :param page: Page reference from PDF writer where the annotations
        and field data will be updated.
    :param fields: a Python dictionary of field names (/T) and text
        values (/V)
    """
    # Pages without annotations carry no form fields; indexing '/Annots'
    # on them would raise KeyError.
    if '/Annots' not in page:
        return
    for annot_ref in page['/Annots']:
        writer_annot = annot_ref.getObject()
        # The annotation's name does not change while /V is set, so look it
        # up once per annotation rather than once per candidate field.
        field_name = writer_annot.get('/T')
        for field in fields:
            if field_name == field:
                writer_annot.update({
                    NameObject("/V"): TextStringObject(fields[field])
                })

372 

def cloneReaderDocumentRoot(self, reader):
    """
    Copy the reader document root to the writer.

    The writer's catalog dictionary is replaced by the ``/Root`` entry of
    the reader's trailer.

    :param reader: PdfFileReader from which the document root should be
        copied.
    """
    reader_trailer = reader.trailer
    self._root_object = reader_trailer['/Root']

381 

def cloneDocumentFromReader(self, reader, after_page_append=None):
    """
    Create a copy (clone) of a document from a PDF file reader.

    :param reader: PDF file reader instance from which the clone
        should be created.
    :callback after_page_append (function): Callback function that is invoked
        after each page is appended to the writer; it is handed on to
        appendPagesFromReader unchanged. Callback signature:

        :param writer_pageref (PDF page reference): Reference to the page
            just appended to the document.
    """
    # Take over the reader's catalog first, then copy its pages.
    self.cloneReaderDocumentRoot(reader)
    self.appendPagesFromReader(reader, after_page_append)

397 

def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Besides building the encryption dictionary, this stores a fresh file
    identifier pair on the writer (``self._ID``), registers the encryption
    dictionary as an indirect object (``self._encrypt``) and remembers the
    computed key (``self._encrypt_key``).

    :param str user_pwd: The "user password", which allows for opening
        and reading the PDF file with the restrictions provided.
    :param str owner_pwd: The "owner password", which allows for
        opening the PDF files without any restrictions.  By default,
        the owner password is the same as the user password.
    :param bool use_128bit: flag as to whether to use 128bit
        encryption.  When false, 40bit encryption will be used.  By default,
        this flag is on.
    """
    import random
    import time

    # Identity test: None is a singleton, so ``is`` is the right comparison.
    if owner_pwd is None:
        owner_pwd = user_pwd
    if use_128bit:
        V = 2
        rev = 3
        keylen = int(128 / 8)
    else:
        V = 1
        rev = 2
        keylen = int(40 / 8)
    # permit everything:
    P = -1
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
    # The two halves of the file identifier are MD5 digests of the current
    # time and of a random number.
    ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
    ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
    self._ID = ArrayObject((ID_1, ID_2))
    if rev == 2:
        U, key = _alg34(user_pwd, O, P, ID_1)
    else:
        assert rev == 3
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)

    # Named encrypt_dict so the local does not shadow this method's name.
    encrypt_dict = DictionaryObject()
    encrypt_dict[NameObject("/Filter")] = NameObject("/Standard")
    encrypt_dict[NameObject("/V")] = NumberObject(V)
    if V == 2:
        # /Length is expressed in bits and only written for V == 2.
        encrypt_dict[NameObject("/Length")] = NumberObject(keylen * 8)
    encrypt_dict[NameObject("/R")] = NumberObject(rev)
    encrypt_dict[NameObject("/O")] = ByteStringObject(O)
    encrypt_dict[NameObject("/U")] = ByteStringObject(U)
    encrypt_dict[NameObject("/P")] = NumberObject(P)
    self._encrypt = self._addObject(encrypt_dict)
    self._encrypt_key = key

444 

def write(self, stream):
    """
    Writes the collection of pages added to this object out as a PDF file.

    The output consists of the header line, every indirect object in id
    order, the cross-reference table, the trailer dictionary and the
    ``startxref`` / end-of-file marker.

    :param stream: An object to write the file to.  The object must support
        the write method and the tell method, similar to a file object.
    """
    if hasattr(stream, 'mode') and 'b' not in stream.mode:
        warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name)

    # Register the catalog as an indirect object the first time through.
    if not self._root:
        self._root = self._addObject(self._root_object)

    # PDF objects sometimes have circular references to their /Page objects
    # inside their object tree (for example, annotations).  Those will be
    # indirect references to objects that we've recreated in this PDF.  To
    # address this problem, PageObject's store their original object
    # reference number, and we add it to the external reference map before
    # we sweep for indirect references.  This forces self-page-referencing
    # trees to reference the correct new object location, rather than
    # copying in a new copy of the page object.
    #
    # Map layout: source pdf -> generation -> id number -> reference here.
    known_refs = {}
    for position, candidate in enumerate(self._objects):
        if not isinstance(candidate, PageObject):
            continue
        source_ref = candidate.indirectRef
        if source_ref != None:
            per_pdf = known_refs.setdefault(source_ref.pdf, {})
            per_generation = per_pdf.setdefault(source_ref.generation, {})
            per_generation[source_ref.idnum] = IndirectObject(position + 1, 0, self)

    # self.stack only exists for the duration of the sweep.
    self.stack = []
    self._sweepIndirectReferences(known_refs, self._root)
    del self.stack

    # Begin writing: header, then each object, remembering where it starts.
    offsets = []
    stream.write(self._header + b_("\n"))
    for idnum in range(1, len(self._objects) + 1):
        obj = self._objects[idnum - 1]
        offsets.append(stream.tell())
        stream.write(b_(str(idnum) + " 0 obj\n"))
        object_key = None
        # Every object except the encryption dictionary itself gets a key
        # derived from the file key, the object number and generation 0.
        if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum:
            file_key = self._encrypt_key
            salted = file_key + struct.pack("<i", idnum)[:3] + struct.pack("<i", 0)[:2]
            assert len(salted) == (len(file_key) + 5)
            object_key = md5(salted).digest()[:min(16, len(file_key) + 5)]
        obj.writeToStream(stream, object_key)
        stream.write(b_("\nendobj\n"))

    # xref table: one free entry for object 0, then one entry per object.
    xref_location = stream.tell()
    entry_count = len(self._objects) + 1
    stream.write(b_("xref\n"))
    stream.write(b_("0 %s\n" % entry_count))
    stream.write(b_("%010d %05d f \n" % (0, 65535)))
    for offset in offsets:
        stream.write(b_("%010d %05d n \n" % (offset, 0)))

    # trailer
    stream.write(b_("trailer\n"))
    trailer = DictionaryObject()
    trailer.update({
        NameObject("/Size"): NumberObject(entry_count),
        NameObject("/Root"): self._root,
        NameObject("/Info"): self._info,
    })
    # /ID and /Encrypt are only present once encrypt() has stored them.
    if hasattr(self, "_ID"):
        trailer[NameObject("/ID")] = self._ID
    if hasattr(self, "_encrypt"):
        trailer[NameObject("/Encrypt")] = self._encrypt
    trailer.writeToStream(stream, None)

    # eof
    stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))

528 

def addMetadata(self, infos):
    """
    Add custom metadata to the output.

    Each key is turned into a name object and each value into a string
    object before being merged into the document information dictionary.

    :param dict infos: a Python dictionary where each key is a field
        and each value is your new metadata.
    """
    converted = {}
    for field, text in infos.items():
        converted[NameObject(field)] = createStringObject(text)
    info_dict = self.getObject(self._info)
    info_dict.update(converted)

540 

def _sweepIndirectReferences(self, externMap, data):
    """
    Walk ``data`` recursively and make every indirect reference in it point
    into this writer.

    :param externMap: nested dictionary (source pdf -> generation -> id
        number) of references already copied into this writer; it is
        extended as further foreign objects are copied.
    :param data: the object to process.
    :return: ``data`` itself for containers, for references into this
        writer and for any other value; for a reference into another pdf,
        the corresponding reference into this writer, or a NullObject when
        resolving it raised ValueError.
    """
    if isinstance(data, DictionaryObject):
        for key, value in list(data.items()):
            swept = self._sweepIndirectReferences(externMap, value)
            if isinstance(swept, StreamObject):
                # a dictionary value is a stream.  streams must be indirect
                # objects, so we need to change this value.
                swept = self._addObject(swept)
            data[key] = swept
        return data

    if isinstance(data, ArrayObject):
        for position in range(len(data)):
            swept = self._sweepIndirectReferences(externMap, data[position])
            if isinstance(swept, StreamObject):
                # an array value is a stream.  streams must be indirect
                # objects, so we need to change this value
                swept = self._addObject(swept)
            data[position] = swept
        return data

    if not isinstance(data, IndirectObject):
        return data

    if data.pdf == self:
        # internal indirect references are fine; self.stack records the id
        # numbers already visited so each target is swept only once.
        if data.idnum not in self.stack:
            self.stack.append(data.idnum)
            self._sweepIndirectReferences(externMap, self.getObject(data))
        return data

    # Reference into another pdf: reuse an earlier copy when there is one.
    by_generation = externMap.get(data.pdf, {})
    by_idnum = by_generation.get(data.generation, {})
    mapped = by_idnum.get(data.idnum, None)
    if mapped == None:
        try:
            foreign = data.pdf.getObject(data)
            # Reserve the slot and record the mapping before descending, so
            # references back to this object found during the recursive
            # sweep resolve to the reserved slot.
            self._objects.append(None)  # placeholder
            idnum = len(self._objects)
            local_ref = IndirectObject(idnum, 0, self)
            per_pdf = externMap.setdefault(data.pdf, {})
            per_generation = per_pdf.setdefault(data.generation, {})
            per_generation[data.idnum] = local_ref
            self._objects[idnum - 1] = self._sweepIndirectReferences(externMap, foreign)
            return local_ref
        except ValueError:
            # Unable to resolve the Object, returning NullObject instead.
            return NullObject()
    return mapped

595 

def getReference(self, obj):
    """
    Build an indirect reference to an object already stored in this writer.

    :param obj: an object present in the writer's object list.
    :return: an IndirectObject (generation 0) pointing at ``obj``.
    :raises ValueError: if ``obj`` is not in the object list (raised by
        ``list.index``).
    """
    position = self._objects.index(obj)
    reference = IndirectObject(position + 1, 0, self)
    # Sanity check: the reference must resolve back to the same object.
    assert reference.getObject() == obj
    return reference

601 

def getOutlineRoot(self):
    """
    Return the outline root of the document, creating it when missing.

    When the catalog has no ``/Outlines`` entry, a new TreeObject is
    registered and referenced from the catalog.

    :return: the outline object itself (not a reference to it).
    """
    catalog = self._root_object
    if '/Outlines' not in catalog:
        outline = TreeObject()
        outline.update({ })
        catalog[NameObject('/Outlines')] = self._addObject(outline)
        return outline

    outline = catalog['/Outlines']
    # Sanity check: the existing outline must be one of this writer's
    # objects and resolve back to itself through a reference.
    idnum = self._objects.index(outline) + 1
    outline_ref = IndirectObject(idnum, 0, self)
    assert outline_ref.getObject() == outline
    return outline

615 

def getNamedDestRoot(self):
    """
    Return the array holding the named destinations, creating any missing
    level (``/Names`` in the catalog, ``/Dests`` below it, ``/Names`` below
    that) on the way.

    :return: the ``/Names`` array of the ``/Dests`` dictionary.
    """
    catalog = self._root_object

    if '/Names' in catalog and isinstance(catalog['/Names'], DictionaryObject):
        names = catalog['/Names']
        # Sanity check: the dictionary must be one of this writer's objects.
        names_ref = IndirectObject(self._objects.index(names) + 1, 0, self)
        assert names_ref.getObject() == names
        reuse_dests = '/Dests' in names and isinstance(names['/Dests'], DictionaryObject)
    else:
        names = DictionaryObject()
        catalog[NameObject('/Names')] = self._addObject(names)
        reuse_dests = False

    if reuse_dests:
        dests = names['/Dests']
        # Same sanity check for the destinations dictionary.
        dests_ref = IndirectObject(self._objects.index(dests) + 1, 0, self)
        assert dests_ref.getObject() == dests
        if '/Names' in dests:
            return dests['/Names']
    else:
        dests = DictionaryObject()
        names[NameObject('/Dests')] = self._addObject(dests)

    # No destination array yet: start an empty one.
    nd = ArrayObject()
    dests[NameObject('/Names')] = nd
    return nd

650 

def addBookmarkDestination(self, dest, parent=None):
    """
    Register ``dest`` and attach it as a child in the outline.

    :param dest: the destination object to add.
    :param parent: outline item to attach to; when omitted, the outline
        root is used.
    :return: the reference under which ``dest`` was registered.
    """
    dest_ref = self._addObject(dest)
    # The outline root is fetched (and created if needed) even when an
    # explicit parent is supplied.
    outline_root = self.getOutlineRoot()

    if parent == None:
        parent = outline_root

    container = parent.getObject()
    container.addChild(dest_ref, self)
    return dest_ref

664 

def addBookmarkDict(self, bookmark, parent=None):
    """
    Add a bookmark described by a plain dictionary to the outline.

    :param bookmark: mapping of entry names to values; an ``/A`` entry, if
        present, is copied into its own action dictionary and registered
        as an indirect object.
    :param parent: outline item to attach to; when omitted, the outline
        root is used.
    :return: the reference under which the bookmark was registered.
    """
    entry = TreeObject()
    for name, value in list(bookmark.items()):
        entry[NameObject(str(name))] = value
    entry.update(bookmark)

    if '/A' in bookmark:
        action = DictionaryObject()
        for name, value in list(bookmark['/A'].items()):
            action[NameObject(str(name))] = value
        # The entry refers to the action indirectly.
        entry[NameObject('/A')] = self._addObject(action)

    entry_ref = self._addObject(entry)
    outline_root = self.getOutlineRoot()

    if parent == None:
        parent = outline_root

    container = parent.getObject()
    container.addChild(entry_ref, self)
    return entry_ref

689 

def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args):
    """
    Add a bookmark to this PDF file.

    :param str title: Title to use for this bookmark.
    :param int pagenum: Page number this bookmark will point to.
    :param parent: A reference to a parent bookmark to create nested
        bookmarks.
    :param tuple color: Color of the bookmark as a red, green, blue tuple
        from 0.0 to 1.0
    :param bool bold: Bookmark is bold
    :param bool italic: Bookmark is italic
    :param str fit: The fit of the destination page. See
        :meth:`addLink()<addLink>` for details.
    :return: the reference under which the bookmark was registered.
    """
    page_ref = self.getObject(self._pages)['/Kids'][pagenum]

    # Extra positional arguments are the zoom parameters for ``fit``;
    # None is written out as a null object.
    zoom_args = [
        NumberObject(arg) if arg is not None else NullObject()
        for arg in args
    ]
    destination = Destination(NameObject("/"+title + " bookmark"), page_ref, NameObject(fit), *zoom_args)
    dest_array = destination.getDestArray()

    # The bookmark jumps to the destination through a /GoTo action.
    goto_action = DictionaryObject()
    goto_action.update({
        NameObject('/D') : dest_array,
        NameObject('/S') : NameObject('/GoTo')
    })
    action_ref = self._addObject(goto_action)

    outline_root = self.getOutlineRoot()
    if parent == None:
        parent = outline_root

    entry = TreeObject()
    entry.update({
        NameObject('/A'): action_ref,
        NameObject('/Title'): createStringObject(title),
    })

    if color is not None:
        entry.update({NameObject('/C'): ArrayObject([FloatObject(c) for c in color])})

    # /F flags: 1 for italic plus 2 for bold; the entry is omitted when
    # neither is requested.
    style_flags = (1 if italic else 0) + (2 if bold else 0)
    if style_flags:
        entry.update({NameObject('/F'): NumberObject(style_flags)})

    entry_ref = self._addObject(entry)

    container = parent.getObject()
    container.addChild(entry_ref, self)
    return entry_ref

750 

def addNamedDestinationObject(self, dest):
    """
    Register an existing destination object as a named destination.

    The destination's ``/Title`` entry and the new reference are appended,
    in that order, to the named destination array.

    :param dest: destination object providing a ``/Title`` entry.
    :return: the reference under which ``dest`` was registered.
    """
    dest_ref = self._addObject(dest)
    name_array = self.getNamedDestRoot()
    name_and_ref = [dest['/Title'], dest_ref]
    name_array.extend(name_and_ref)
    return dest_ref

758 

def addNamedDestination(self, title, pagenum):
    """
    Add a named destination pointing at a page of this writer.

    The destination is a ``/GoTo`` action targeting the page with
    ``/FitH`` and a fixed coordinate of 826.

    :param title: name stored for the destination.
    :param int pagenum: index of the target page in the root ``/Kids``
        array.
    :return: the reference under which the action was registered.
    """
    page_ref = self.getObject(self._pages)['/Kids'][pagenum]
    target = ArrayObject([page_ref, NameObject('/FitH'), NumberObject(826)])

    goto_action = DictionaryObject()
    goto_action.update({
        NameObject('/D') : target,
        NameObject('/S') : NameObject('/GoTo')
    })
    action_ref = self._addObject(goto_action)

    # Name and reference are stored as consecutive array elements.
    name_array = self.getNamedDestRoot()
    name_array.extend([title, action_ref])
    return action_ref

773 

def removeLinks(self):
    """
    Removes links and annotations from this output.

    Every page listed in the root ``/Kids`` array loses its ``/Annots``
    entry, if it has one.
    """
    kid_refs = self.getObject(self._pages)['/Kids']
    for kid_ref in kid_refs:
        page_dict = self.getObject(kid_ref)
        if "/Annots" not in page_dict:
            continue
        del page_dict['/Annots']

783 

def removeImages(self, ignoreByteStringObject=False):
    """
    Removes images from this output.

    Each page's content stream is filtered: ``re`` operations are always
    dropped, and between a ``q`` and the following ``Q`` the path, graphics
    state, ``Do`` and ``sh`` operators listed below are dropped as well.

    :param bool ignoreByteStringObject: when true, operands of the text
        showing operators that are not TextStringObject instances are
        replaced by an empty TextStringObject.
    """
    # Operators discarded while inside a q ... Q section.
    dropped_in_q_section = [
        b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'),
        b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'),
        b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh'),
    ]

    page_refs = self.getObject(self._pages)['/Kids']
    for page_ref in page_refs:
        page_dict = self.getObject(page_ref)
        content = page_dict['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page_dict)

        kept = []
        inside_q = False
        for operands, operator in content.operations:
            # Text showing operators: Tj and ' carry the string as their
            # first operand, " as its third, TJ as an array of pieces.
            if operator == b_('Tj') or operator == b_("'"):
                text = operands[0]
                if ignoreByteStringObject and not isinstance(text, TextStringObject):
                    operands[0] = TextStringObject()
            elif operator == b_('"'):
                text = operands[2]
                if ignoreByteStringObject and not isinstance(text, TextStringObject):
                    operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                pieces = operands[0]
                for position in range(len(pieces)):
                    if ignoreByteStringObject and not isinstance(pieces[position], TextStringObject):
                        pieces[position] = TextStringObject()

            # A single flag tracks the q ... Q section (nesting depth is
            # not counted).
            if operator == b_('q'):
                inside_q = True
            if operator == b_('Q'):
                inside_q = False
            if inside_q and operator in dropped_in_q_section:
                continue
            if operator == b_('re'):
                continue
            kept.append((operands, operator))

        content.operations = kept
        page_dict[NameObject('/Contents')] = content

838 

def removeText(self, ignoreByteStringObject=False):
    """
    Removes text from this output.

    The string operands of the text showing operators (``Tj``, ``'``,
    ``"`` and ``TJ``) in each page's content stream are replaced by an
    empty TextStringObject.

    :param bool ignoreByteStringObject: when false, only TextStringObject
        operands are blanked; when true, ByteStringObject operands are
        blanked as well.
    """
    # Operand types that get replaced by an empty text string.
    if ignoreByteStringObject:
        blankable = (TextStringObject, ByteStringObject)
    else:
        blankable = (TextStringObject,)

    page_refs = self.getObject(self._pages)['/Kids']
    for page_ref in page_refs:
        page_dict = self.getObject(page_ref)
        content = page_dict['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page_dict)

        for operands, operator in content.operations:
            # Tj and ' carry the string as their first operand, " as its
            # third, TJ as an array of pieces.
            if operator == b_('Tj') or operator == b_("'"):
                if isinstance(operands[0], blankable):
                    operands[0] = TextStringObject()
            elif operator == b_('"'):
                if isinstance(operands[2], blankable):
                    operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                pieces = operands[0]
                for position in range(len(pieces)):
                    if isinstance(pieces[position], blankable):
                        pieces[position] = TextStringObject()

        page_dict[NameObject('/Contents')] = content

892 

def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args):
    """
    Attach a link annotation to one page that jumps to another page of
    this document.

    :param int pagenum: index of the page that receives the link.
    :param int pagedest: index of the page the link leads to.
    :param rect: clickable area, given as a
        :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, an array
        of four integers ``[xLL, yLL, xUR, yUR]``, or a string of the
        form ``"[ xLL yLL xUR yUR ]"``.
    :param border: optional array of border-drawing properties (see the
        PDF spec).  Only the first three entries are used directly; a
        fourth entry, when present, is stored as a nested array.  When
        omitted, a border of three zeros is written.
    :param str fit: page fit or 'zoom' option (see below).
    :param args: extra coordinates required by the chosen fit option;
        ``None`` is written as a null value for that coordinate.

    Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details):
         /Fit      No additional arguments
         /XYZ      [left] [top] [zoomFactor]
         /FitH     [top]
         /FitV     [left]
         /FitR     [left] [bottom] [right] [top]
         /FitB     No additional arguments
         /FitBH    [top]
         /FitBV    [left]
    """
    sourcePageRef = self.getObject(self._pages)['/Kids'][pagenum]
    # TODO: switch for external link
    targetPageRef = self.getObject(self._pages)['/Kids'][pagedest]
    sourcePage = self.getObject(sourcePageRef)

    if border is None:
        borderEntries = [NumberObject(0)] * 3
    else:
        borderEntries = [NameObject(n) for n in border[:3]]
        if len(border) == 4:
            borderEntries.append(
                ArrayObject([NameObject(n) for n in border[3]]))

    # Strings and plain sequences are converted; RectangleObject is used as is.
    if isString(rect):
        rect = NameObject(rect)
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    zoomArgs = [NullObject() if a is None else NumberObject(a) for a in args]
    # TODO: create a better name for the link
    destination = Destination(
        NameObject("/LinkName"), targetPageRef, NameObject(fit), *zoomArgs)
    destArray = destination.getDestArray()

    annotation = DictionaryObject()
    annotation.update({
        NameObject('/Type'): NameObject('/Annot'),
        NameObject('/Subtype'): NameObject('/Link'),
        NameObject('/P'): sourcePageRef,
        NameObject('/Rect'): rect,
        NameObject('/Border'): ArrayObject(borderEntries),
        NameObject('/Dest'): destArray
    })
    annotationRef = self._addObject(annotation)

    # Append to the page's annotation list, creating it when absent.
    if "/Annots" in sourcePage:
        sourcePage['/Annots'].append(annotationRef)
    else:
        sourcePage[NameObject('/Annots')] = ArrayObject([annotationRef])

962 

# Layout names that setPageLayout() accepts without emitting a warning.
_valid_layouts = [
    '/NoLayout',
    '/SinglePage',
    '/OneColumn',
    '/TwoColumnLeft',
    '/TwoColumnRight',
    '/TwoPageLeft',
    '/TwoPageRight',
]

964 

def getPageLayout(self):
    """
    Return the ``/PageLayout`` entry of the document root object.
    See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` for a
    description of valid layouts.

    :return: Page layout currently being used.
    :rtype: str, None if not specified
    """
    try:
        layout = self._root_object['/PageLayout']
    except KeyError:
        # No layout has been stored on the root object.
        layout = None
    return layout

977 

def setPageLayout(self, layout):
    """
    Store the page layout on the document root object.

    :param str layout: The page layout to be used.  A value that is not
        already a NameObject is converted to one; a warning is issued
        first if it is not one of the valid layouts.

    Valid layouts are:
         /NoLayout        Layout explicitly not specified
         /SinglePage      Show one page at a time
         /OneColumn       Show one column at a time
         /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
         /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
         /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
         /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
    """
    if isinstance(layout, NameObject):
        # Already a name object: stored unchanged and without validation.
        layoutName = layout
    else:
        if layout not in self._valid_layouts:
            warnings.warn("Layout should be one of: {}".format(', '.join(self._valid_layouts)))
        layoutName = NameObject(layout)
    self._root_object.update({NameObject('/PageLayout'): layoutName})

998 

pageLayout = property(fget=getPageLayout, fset=setPageLayout)
"""Read and write property accessing the :meth:`getPageLayout()<PdfFileWriter.getPageLayout>`
and :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` methods."""

1002 

# Mode names that setPageMode() accepts without emitting a warning.
_valid_modes = [
    '/UseNone',
    '/UseOutlines',
    '/UseThumbs',
    '/FullScreen',
    '/UseOC',
    '/UseAttachments',
]

1004 

def getPageMode(self):
    """
    Return the ``/PageMode`` entry of the document root object.
    See :meth:`setPageMode()<PdfFileWriter.setPageMode>` for a description
    of valid modes.

    :return: Page mode currently being used.
    :rtype: str, None if not specified
    """
    try:
        mode = self._root_object['/PageMode']
    except KeyError:
        # No mode has been stored on the root object.
        mode = None
    return mode

1018 

def setPageMode(self, mode):
    """
    Store the page mode on the document root object.

    :param str mode: The page mode to use.  A value that is not already
        a NameObject is converted to one; a warning is issued first if
        it is not one of the valid modes.

    Valid modes are:
        /UseNone         Do not show outlines or thumbnails panels
        /UseOutlines     Show outlines (aka bookmarks) panel
        /UseThumbs       Show page thumbnails panel
        /FullScreen      Fullscreen view
        /UseOC           Show Optional Content Group (OCG) panel
        /UseAttachments  Show attachments panel
    """
    if isinstance(mode, NameObject):
        # Already a name object: stored unchanged and without validation.
        modeName = mode
    else:
        if mode not in self._valid_modes:
            warnings.warn("Mode should be one of: {}".format(', '.join(self._valid_modes)))
        modeName = NameObject(mode)
    self._root_object.update({NameObject('/PageMode'): modeName})

1038 

pageMode = property(fget=getPageMode, fset=setPageMode)
"""Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>`
and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods."""

1042 

1043 

1044class PdfFileReader(object): 

1045 """ 

1046 Initializes a PdfFileReader object. This operation can take some time, as 

1047 the PDF stream's cross-reference tables are read into memory. 

1048 

1049 :param stream: A File object or an object that supports the standard read 

1050 and seek methods similar to a File object. Could also be a 

1051 string representing a path to a PDF file. 

1052 :param bool strict: Determines whether user should be warned of all 

1053 problems and also causes some correctable problems to be fatal. 

1054 Defaults to ``True``. 

1055 :param warndest: Destination for logging warnings (defaults to 

1056 ``sys.stderr``). 

1057 :param bool overwriteWarnings: Determines whether to override Python's 

1058 ``warnings.py`` module with a custom implementation (defaults to 

1059 ``True``). 

1060 """ 

def __init__(self, stream, strict=True, warndest=None, overwriteWarnings=True):
    """
    Set up the reader state and parse *stream*.

    :param stream: a file-like object supporting ``read`` and ``seek``,
        or a string path; a path is opened in binary mode and its whole
        content is loaded into an in-memory buffer.
    :param bool strict: stored as ``self.strict``.
    :param warndest: object whose ``write`` receives formatted warnings
        (``sys.stderr`` when ``None``); only used when
        *overwriteWarnings* is true.
    :param bool overwriteWarnings: when true, ``warnings.showwarning``
        is replaced process-wide by a handler writing to *warndest*.
    """
    if overwriteWarnings:
        # have to dynamically override the default showwarning since there are no
        # public methods that specify the 'file' parameter
        def _showwarning(message, category, filename, lineno, file=warndest, line=None):
            if file is None:
                file = sys.stderr
            try:
                file.write(formatWarning(message, category, filename, lineno, line))
            except IOError:
                # Deliberate best effort: a failing warning sink must not
                # abort PDF processing.
                pass
        warnings.showwarning = _showwarning
    self.strict = strict
    self.flattenedPages = None
    self.resolvedObjects = {}
    self.xrefIndex = 0
    self._pageId2Num = None  # map page IndirectRef number to Page Number
    # getObject() reads this flag, so it has to exist before parsing
    # starts in case read() needs to resolve an indirect object.
    self._override_encryption = False
    if hasattr(stream, 'mode') and 'b' not in stream.mode:
        warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
    if isString(stream):
        # The context manager closes the file even if reading it fails.
        with open(stream, 'rb') as fileobj:
            stream = BytesIO(b_(fileobj.read()))
    # getObject() seeks in self.stream; make it available during read().
    self.stream = stream
    self.read(stream)

1088 

def getDocumentInfo(self):
    """
    Build a document information object from the trailer's ``/Info``
    entry.  Metadata streams, which some PDF files use instead of a
    docinfo dictionary, are not consulted by this function.

    :return: the document information of this PDF file
    :rtype: :class:`DocumentInformation<pdf.DocumentInformation>` or ``None`` if none exists.
    """
    if "/Info" in self.trailer:
        rawInfo = self.trailer['/Info']
        info = DocumentInformation()
        info.update(rawInfo)
        return info
    return None

1105 

documentInfo = property(fget=lambda reader: reader.getDocumentInfo())
"""Read-only property that accesses the :meth:`getDocumentInfo()<PdfFileReader.getDocumentInfo>` function."""

1108 

def getXmpMetadata(self):
    """
    Fetch XMP (Extensible Metadata Platform) data through the document
    root's ``getXmpMetadata()``.

    The ``_override_encryption`` flag is raised for the duration of the
    lookup and always reset to ``False`` afterwards.

    :return: a :class:`XmpInformation<xmp.XmpInformation>`
        instance that can be used to access XMP metadata from the document.
    :rtype: :class:`XmpInformation<xmp.XmpInformation>` or
        ``None`` if no metadata was found on the document root.
    """
    try:
        self._override_encryption = True
        root = self.trailer["/Root"]
        metadata = root.getXmpMetadata()
    finally:
        self._override_encryption = False
    return metadata

1124 

xmpMetadata = property(fget=lambda reader: reader.getXmpMetadata())
"""
Read-only property that accesses the
:meth:`getXmpMetadata()<PdfFileReader.getXmpMetadata>` function.
"""

1130 

def getNumPages(self):
    """
    Calculates the number of pages in this PDF file.

    For an encrypted file the count is read from the ``/Count`` entry of
    the page tree root after attempting decryption with an empty
    password; otherwise the flattened page list is built on demand and
    its length is returned.

    :return: number of pages
    :rtype: int
    :raises PdfReadError: if file is encrypted and restrictions prevent
        this action.
    """
    # Flattened pages will not work on an Encrypted PDF;
    # the PDF file's page count is used in this case. Otherwise,
    # the original method (flattened page count) is used.
    if self.isEncrypted:
        try:
            self._override_encryption = True
            self.decrypt('')
            return self.trailer["/Root"]["/Pages"]["/Count"]
        except Exception:
            # Only ordinary errors are translated; KeyboardInterrupt and
            # SystemExit must propagate instead of being reported as a
            # decryption problem.
            raise utils.PdfReadError("File has not been decrypted")
        finally:
            self._override_encryption = False
    if self.flattenedPages is None:
        self._flatten()
    return len(self.flattenedPages)

1157 

numPages = property(fget=lambda reader: reader.getNumPages())
"""
Read-only property that accesses the
:meth:`getNumPages()<PdfFileReader.getNumPages>` function.
"""

1163 

def getPage(self, pageNumber):
    """
    Look up a page of this PDF file by index, building the flattened
    page list first if it does not exist yet.

    :param int pageNumber: The page number to retrieve
        (pages begin at zero)
    :return: a :class:`PageObject<pdf.PageObject>` instance.
    :rtype: :class:`PageObject<pdf.PageObject>`
    """
    pageList = self.flattenedPages
    if pageList is None:
        self._flatten()
        pageList = self.flattenedPages
    return pageList[pageNumber]

1178 

namedDestinations = property(fget=lambda reader: reader.getNamedDestinations())
"""
Read-only property that accesses the
:meth:`getNamedDestinations()<PdfFileReader.getNamedDestinations>` function.
"""

1185 

1186 # A select group of relevant field attributes. For the complete list, 

1187 # see section 8.6.2 of the PDF 1.7 reference. 

1188 

def getFields(self, tree = None, retval = None, fileobj = None):
    """
    Collect interactive form field data, if this PDF contains any.
    The *tree* and *retval* parameters are for recursive use.

    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field<PyPDF2.generic.Field>` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    fieldAttributes = {
        "/FT": "Field Type",
        "/Parent": "Parent",
        "/T": "Field Name",
        "/TU": "Alternate Field Name",
        "/TM": "Mapping Name",
        "/Ff": "Field Flags",
        "/V": "Value",
        "/DV": "Default Value",
    }
    if retval is None:
        # Outermost call: start from the catalog's AcroForm tree.
        retval = {}
        catalog = self.trailer["/Root"]
        if "/AcroForm" not in catalog:
            return None
        tree = catalog["/AcroForm"]
    if tree is None:
        return retval

    self._checkKids(tree, retval, fileobj)
    # A node carrying any known field attribute is itself a field.
    if any(attr in tree for attr in fieldAttributes):
        self._buildField(tree, retval, fileobj, fieldAttributes)

    if "/Fields" in tree:
        for fieldRef in tree["/Fields"]:
            self._buildField(fieldRef.getObject(), retval, fileobj, fieldAttributes)

    return retval

1230 

def _buildField(self, field, retval, fileobj, fieldAttributes):
    """
    Record *field* in *retval* after descending into its kids.

    The key is the field's ``/TM`` entry, falling back to ``/T``; a
    field with neither is skipped.  When *fileobj* is truthy, the field
    is also written to it followed by a blank line.
    """
    self._checkKids(field, retval, fileobj)
    for nameKey in ("/TM", "/T"):
        try:
            key = field[nameKey]
        except KeyError:
            continue
        break
    else:
        # Ignore no-name field for now
        return
    if fileobj:
        self._writeField(fileobj, field, fieldAttributes)
        fileobj.write("\n")
    retval[key] = Field(field)

1245 

def _checkKids(self, tree, retval, fileobj):
    """Run getFields() on every resolved entry of the node's ``/Kids``, if any."""
    if "/Kids" not in tree:
        return
    # recurse down the tree
    for kidRef in tree["/Kids"]:
        self.getFields(kidRef.getObject(), retval, fileobj)

1251 

def _writeField(self, fileobj, field, fieldAttributes):
    """
    Write one ``label: value`` line to *fileobj* for each attribute the
    field has, in a fixed order.  Labels come from *fieldAttributes*;
    attributes missing from *field* are silently skipped.
    """
    # Make the field type value more clear
    typeLabels = {"/Btn": "Button", "/Tx": "Text", "/Ch": "Choice",
                  "/Sig": "Signature"}
    for attr in ("/TM", "/T", "/FT", "/Parent", "/TU", "/Ff", "/V", "/DV"):
        label = fieldAttributes[attr]
        try:
            if attr == "/FT":
                # Unrecognised field types produce no output line.
                if field[attr] in typeLabels:
                    fileobj.write(label + ": " + typeLabels[field[attr]] + "\n")
            elif attr == "/Parent":
                # Let's just write the name of the parent
                try:
                    parentName = field["/Parent"]["/TM"]
                except KeyError:
                    parentName = field["/Parent"]["/T"]
                fileobj.write(label + ": " + parentName + "\n")
            else:
                fileobj.write(label + ": " + str(field[attr]) + "\n")
        except KeyError:
            # Field attribute is N/A or unknown, so don't write anything
            continue

1275 

def getFormTextFields(self):
    """
    Retrieves the text form fields (field type ``/Tx``) of the document.

    :return: a dictionary mapping each text field's ``/T`` entry to its
        ``/V`` entry (``None`` when the field has no value).  An empty
        dictionary is returned when the document has no form data.
    :rtype: dict
    """
    # Retrieve document form fields
    formfields = self.getFields()
    if formfields is None:
        # getFields() returns None when there is no /AcroForm; iterating
        # over that would raise TypeError.
        return {}
    return dict(
        (field['/T'], field.get('/V'))
        for field in formfields.values()
        if field.get('/FT') == '/Tx'
    )

1285 

def getNamedDestinations(self, tree=None, retval=None):
    """
    Collect the named destinations present in the document.

    On the outermost call (``retval`` omitted) the tree is taken from the
    catalog's ``/Dests`` entry or, failing that, from ``/Names/Dests``.
    *tree* and *retval* are used by the recursive calls.

    :return: a dictionary which maps names to
        :class:`Destinations<PyPDF2.generic.Destination>`.
    :rtype: dict
    """
    if retval is None:
        retval = {}
        catalog = self.trailer["/Root"]

        # get the name tree
        if "/Dests" in catalog:
            tree = catalog["/Dests"]
        elif "/Names" in catalog:
            nameDict = catalog['/Names']
            if "/Dests" in nameDict:
                tree = nameDict['/Dests']

    if tree is None:
        return retval

    if "/Kids" in tree:
        # recurse down the tree
        for kidRef in tree["/Kids"]:
            self.getNamedDestinations(kidRef.getObject(), retval)

    if "/Names" in tree:
        # /Names is a flat array of alternating keys and values.
        pairs = tree["/Names"]
        for pos in range(0, len(pairs), 2):
            key = pairs[pos].getObject()
            target = pairs[pos + 1].getObject()
            if isinstance(target, DictionaryObject) and '/D' in target:
                target = target['/D']
            dest = self._buildDestination(key, target)
            if dest is not None:
                retval[key] = dest

    return retval

1326 

outlines = property(fget=lambda reader: reader.getOutlines())
"""
Read-only property that accesses the
:meth:`getOutlines()<PdfFileReader.getOutlines>` function.
"""

1332 

def getOutlines(self, node=None, outlines=None):
    """
    Collect the document outline present in the document.

    On the outermost call (``outlines`` omitted) the walk starts at the
    ``/First`` entry of the catalog's ``/Outlines`` dictionary and the
    named destinations are cached on ``self._namedDests``.  *node* and
    *outlines* are used by the recursive calls.

    :return: a nested list of :class:`Destinations<PyPDF2.generic.Destination>`.
    """
    if outlines is None:
        outlines = []
        catalog = self.trailer["/Root"]

        # get the outline dictionary and named destinations
        if "/Outlines" in catalog:
            try:
                outlineRoot = catalog["/Outlines"]
            except utils.PdfReadError:
                # this occurs if the /Outlines object reference is incorrect
                # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf
                # so continue to load the file without the Bookmarks
                return outlines

            if "/First" in outlineRoot:
                node = outlineRoot["/First"]
        self._namedDests = self.getNamedDestinations()

    if node is None:
        return outlines

    # Follow the /Next chain, descending into /First for sub-outlines.
    while True:
        entry = self._buildOutline(node)
        if entry:
            outlines.append(entry)

        if "/First" in node:
            children = []
            self.getOutlines(node["/First"], children)
            if children:
                outlines.append(children)

        if "/Next" not in node:
            break
        node = node["/Next"]

    return outlines

1378 

def _getPageNumberByIndirect(self, indirectRef):
    """
    Translate an object id (or an object exposing ``idnum``) into a
    page index, building the ``_pageId2Num`` lookup table on first use.

    :return: the page index, or -1 when the id is not a known page.
    """
    if self._pageId2Num is None:
        self._pageId2Num = dict(
            (page.indirectRef.idnum, index)
            for index, page in enumerate(self.pages)
        )

    idnum = indirectRef if isinstance(indirectRef, int) else indirectRef.idnum
    return self._pageId2Num.get(idnum, -1)

1394 

def getPageNumber(self, page):
    """
    Look up the page number of a PageObject through its ``indirectRef``.

    :param PageObject page: The page to get page number. Should be
        an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    return self._getPageNumberByIndirect(page.indirectRef)

1407 

def getDestinationPageNumber(self, destination):
    """
    Look up the page number a Destination points to through its
    ``page`` attribute.

    :param Destination destination: The destination to get page number.
         Should be an instance of
         :class:`Destination<PyPDF2.pdf.Destination>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    return self._getPageNumberByIndirect(destination.page)

1421 

def _buildDestination(self, title, array):
    """
    Create a Destination from *title* and a destination array whose
    first two items are the page and the fit type; any remaining items
    are passed on as extra positional arguments.
    """
    target, fitType = array[0:2]
    extraArgs = array[2:]
    return Destination(title, target, fitType, *extraArgs)

1426 

def _buildOutline(self, node):
    """
    Turn one outline node into a destination.

    A node needs a ``/Title`` plus either an ``/A`` action of type
    ``/GoTo`` or a ``/Dest`` entry; anything else yields ``None``.

    :return: the destination built for the node, or ``None``.
    :raises PdfReadError: if the destination is neither an array nor a
        string naming a known destination in ``self._namedDests``.
    """
    dest = title = None

    if "/Title" in node:
        if "/A" in node:
            # Action, section 8.5 (only type GoTo supported)
            title = node["/Title"]
            action = node["/A"]
            if action["/S"] == "/GoTo":
                dest = action["/D"]
        elif "/Dest" in node:
            # Destination, section 8.2.1
            title = node["/Title"]
            dest = node["/Dest"]

    # if destination found, then create outline
    if not dest:
        return None
    if isinstance(dest, ArrayObject):
        return self._buildDestination(title, dest)
    if isString(dest) and dest in self._namedDests:
        outline = self._namedDests[dest]
        outline[NameObject("/Title")] = title
        return outline
    raise utils.PdfReadError("Unexpected destination %r" % dest)

1451 

pages = property(
    fget=lambda reader: ConvertFunctionsToVirtualList(reader.getNumPages,
                                                      reader.getPage))
"""
Read-only property that emulates a list based upon the
:meth:`getNumPages()<PdfFileReader.getNumPages>` and
:meth:`getPage()<PdfFileReader.getPage>` methods.
"""

1459 

def getPageLayout(self):
    """
    Return the ``/PageLayout`` entry of the document catalog.
    See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>`
    for a description of valid layouts.

    :return: Page layout currently being used.
    :rtype: ``str``, ``None`` if not specified
    """
    try:
        layout = self.trailer['/Root']['/PageLayout']
    except KeyError:
        # Either key is missing: no layout is specified.
        layout = None
    return layout

1473 

pageLayout = property(fget=getPageLayout)
"""Read-only property accessing the
:meth:`getPageLayout()<PdfFileReader.getPageLayout>` method."""

1477 

def getPageMode(self):
    """
    Return the ``/PageMode`` entry of the document catalog.
    See :meth:`setPageMode()<PdfFileWriter.setPageMode>`
    for a description of valid modes.

    :return: Page mode currently being used.
    :rtype: ``str``, ``None`` if not specified
    """
    try:
        mode = self.trailer['/Root']['/PageMode']
    except KeyError:
        # Either key is missing: no mode is specified.
        mode = None
    return mode

1491 

pageMode = property(fget=getPageMode)
"""Read-only property accessing the
:meth:`getPageMode()<PdfFileReader.getPageMode>` method."""

1495 

def _flatten(self, pages=None, inherit=None, indirectRef=None):
    """
    Walk the page tree recursively and fill ``self.flattenedPages`` with
    one PageObject per ``/Page`` node.

    :param pages: the node to process; when omitted, the list is reset
        and the walk starts at the catalog's ``/Pages`` entry.
    :param dict inherit: inheritable attribute values collected from the
        ancestors of *pages*.
    :param indirectRef: the indirect reference through which *pages*
        was reached, handed to the PageObject that is created.
    """
    inheritablePageAttributes = (
        NameObject("/Resources"), NameObject("/MediaBox"),
        NameObject("/CropBox"), NameObject("/Rotate")
    )
    if inherit is None:
        inherit = dict()
    if pages is None:
        self.flattenedPages = []
        catalog = self.trailer["/Root"].getObject()
        pages = catalog["/Pages"].getObject()

    # A node without /Type is treated as an intermediate /Pages node.
    t = "/Pages"
    if "/Type" in pages:
        t = pages["/Type"]

    if t == "/Pages":
        # Work on a private copy: values defined by this node must reach
        # its own descendants only, not sibling subtrees that share the
        # caller's dictionary.
        inherit = dict(inherit)
        for attr in inheritablePageAttributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in pages["/Kids"]:
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirectRef"] = page
            self._flatten(page.getObject(), inherit, **addt)
    elif t == "/Page":
        for attr, value in list(inherit.items()):
            # if the page has its own value, it does not inherit the
            # parent's value:
            if attr not in pages:
                pages[attr] = value
        pageObj = PageObject(self, indirectRef)
        pageObj.update(pages)
        self.flattenedPages.append(pageObj)

1530 

def _getObjectFromStream(self, indirectReference):
    """
    Read the object designated by *indirectReference* out of the object
    stream recorded for it in ``self.xref_objStm``.

    The whole object stream is loaded into memory, its (object number,
    offset) pairs are scanned, and the matching object is parsed.

    :return: the parsed object, or a NullObject in non-strict mode when
        the object is unreadable or not listed in the stream.
    :raises PdfReadError: in strict mode, when the object sits at a
        different index than recorded, cannot be read, or is not found.
    """
    # indirect reference to object in object stream
    # read the entire object stream into memory
    debug = False
    wantedId = indirectReference.idnum
    stmnum, idx = self.xref_objStm[wantedId]
    if debug:
        print(("Here1: %s %s"%(stmnum, idx)))
    objStm = IndirectObject(stmnum, 0, self).getObject()
    if debug:
        print(("Here2: objStm=%s.. stmnum=%s data=%s"%(objStm, stmnum, objStm.getData())))
    # This is an xref to a stream, so its type better be a stream
    assert objStm['/Type'] == '/ObjStm'
    # /N is the number of indirect objects in the stream
    assert idx < objStm['/N']
    streamData = BytesIO(b_(objStm.getData()))

    def _seekToNextToken():
        # Skip whitespace, then step back onto the first non-blank byte.
        readNonWhitespace(streamData)
        streamData.seek(-1, 1)

    for i in range(objStm['/N']):
        _seekToNextToken()
        objnum = NumberObject.readFromStream(streamData)
        _seekToNextToken()
        offset = NumberObject.readFromStream(streamData)
        _seekToNextToken()
        if objnum != wantedId:
            # We're only interested in one object
            continue
        if self.strict and idx != i:
            raise utils.PdfReadError("Object is in wrong index.")
        # Offsets are relative to the stream's /First entry.
        streamData.seek(objStm['/First']+offset, 0)
        if debug:
            pos = streamData.tell()
            streamData.seek(0, 0)
            lines = streamData.readlines()
            for i in range(0, len(lines)):
                print((lines[i]))
            streamData.seek(pos, 0)
        try:
            obj = readObject(streamData, self)
        except utils.PdfStreamError as err:
            # Stream object cannot be read. Normally, a critical error, but
            # Adobe Reader doesn't complain, so continue (in strict mode?)
            warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \
                (i, indirectReference.idnum, indirectReference.generation, err), utils.PdfReadWarning)

            if self.strict:
                raise utils.PdfReadError("Can't read object stream: %s"%err)
            # Replace with null. Hopefully it's nothing important.
            obj = NullObject()
        return obj

    # The wanted object number was not listed in the stream.
    if self.strict:
        raise utils.PdfReadError("This is a fatal error in strict mode.")
    return NullObject()

1583 

def getObject(self, indirectReference):
    """
    Resolve *indirectReference* to the object it designates.

    The cache is consulted first; otherwise the object is read from an
    object stream or from the offset recorded in the xref table, then
    decrypted when the file is encrypted and ``_override_encryption``
    is not set, and finally cached.

    :raises PdfReadError: if the object is not defined, if the object
        header does not match the reference, or if the file is
        encrypted and no decryption key is available.
    """
    debug = False
    wantedId = indirectReference.idnum
    wantedGen = indirectReference.generation
    if debug:
        print(("looking at:", indirectReference.idnum, indirectReference.generation))
    retval = self.cacheGetIndirectObject(wantedGen, wantedId)
    if retval != None:
        return retval

    if wantedGen == 0 and wantedId in self.xref_objStm:
        retval = self._getObjectFromStream(indirectReference)
    elif wantedGen in self.xref and wantedId in self.xref[wantedGen]:
        start = self.xref[wantedGen][wantedId]
        if debug:
            print(("  Uncompressed Object", indirectReference.idnum, indirectReference.generation, ":", start))
        self.stream.seek(start, 0)
        foundId, foundGen = self.readObjectHeader(self.stream)
        if foundId != wantedId:
            if not self.xrefIndex:
                # some other problem
                raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \
                                 % (indirectReference.idnum, indirectReference.generation, foundId, foundGen))
            # Xref table probably had bad indexes due to not being zero-indexed;
            # xref table is corrected in non-strict mode
            if self.strict:
                raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \
                                 % (indirectReference.idnum, indirectReference.generation, foundId, foundGen))
        assert foundGen == wantedGen
        retval = readObject(self.stream, self)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self.isEncrypted:
            # if we don't have the encryption key:
            if not hasattr(self, '_decryption_key'):
                raise utils.PdfReadError("file has not been decrypted")
            # otherwise, decrypt here...
            import struct
            # Per-object key: file key + low 3 bytes of the object number
            # + low 2 bytes of the generation, hashed with MD5.
            idBytes = struct.pack("<i", wantedId)[:3]
            genBytes = struct.pack("<i", wantedGen)[:2]
            seed = self._decryption_key + idBytes + genBytes
            assert len(seed) == (len(self._decryption_key) + 5)
            digest = md5(seed).digest()
            objectKey = digest[:min(16, len(self._decryption_key) + 5)]
            retval = self._decryptObject(retval, objectKey)
    else:
        warnings.warn("Object %d %d not defined."%(indirectReference.idnum,
                    indirectReference.generation), utils.PdfReadWarning)
        raise utils.PdfReadError("Could not find object.")
    self.cacheIndirectObject(wantedGen, wantedId, retval)
    return retval

1635 

def _decryptObject(self, obj, key):
    """
    Apply RC4 with *key* to *obj*, recursing into containers.

    String objects are replaced by newly created string objects; stream
    data, dictionary values and array items are updated in place.  Any
    other object is returned untouched.

    :return: the decrypted object.
    """
    if isinstance(obj, (ByteStringObject, TextStringObject)):
        return createStringObject(utils.RC4_encrypt(key, obj.original_bytes))
    # Streams are tested before the general dictionary case.
    if isinstance(obj, StreamObject):
        obj._data = utils.RC4_encrypt(key, obj._data)
    elif isinstance(obj, DictionaryObject):
        for entryKey, entryValue in list(obj.items()):
            obj[entryKey] = self._decryptObject(entryValue, key)
    elif isinstance(obj, ArrayObject):
        for position, item in enumerate(obj):
            obj[position] = self._decryptObject(item, key)
    return obj

1648 

def readObjectHeader(self, stream):
    """
    Read an object header at the current position of *stream* and
    leave the stream on the first non-whitespace byte that follows.

    In strict mode a warning is issued when either whitespace skip
    reports a truthy result.

    :return: tuple ``(idnum, generation)`` of ints.
    """
    # Should never be necessary to read out whitespace, since the
    # cross-reference table should put us in the right spot to read the
    # object header.  In reality... some files have stupid cross reference
    # tables that are off by whitespace bytes.
    sawExtra = False
    utils.skipOverComment(stream)
    sawExtra |= utils.skipOverWhitespace(stream)
    stream.seek(-1, 1)
    idToken = readUntilWhitespace(stream)
    sawExtra |= utils.skipOverWhitespace(stream)
    stream.seek(-1, 1)
    genToken = readUntilWhitespace(stream)
    # Consume the three bytes after the generation number
    # (presumably the "obj" keyword).
    stream.read(3)
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    if sawExtra and self.strict:
        # not a fatal error
        warnings.warn("Superfluous whitespace found in object header %s %s" % \
                      (idToken, genToken), utils.PdfReadWarning)
    return int(idToken), int(genToken)

1668 

def cacheGetIndirectObject(self, generation, idnum):
    """
    Look up a previously resolved object in ``self.resolvedObjects``.

    :return: the cached object, or ``None`` when nothing is cached for
        ``(generation, idnum)``.
    """
    debug = False
    cached = self.resolvedObjects.get((generation, idnum))
    if debug:
        if cached:
            print(("cache hit: %d %d"%(idnum, generation)))
        else:
            print(("cache miss: %d %d"%(idnum, generation)))
    return cached

1675 

def cacheIndirectObject(self, generation, idnum, obj):
    """
    Store *obj* in ``self.resolvedObjects`` under ``(generation, idnum)``.

    Replacing an existing entry raises PdfReadError in strict mode and
    only issues a warning otherwise.

    :return: *obj*
    """
    # return None # Sometimes we want to turn off cache for debugging.
    cacheKey = (generation, idnum)
    if cacheKey in self.resolvedObjects:
        msg = "Overwriting cache for %s %s"%(generation, idnum)
        if self.strict:
            raise utils.PdfReadError(msg)
        warnings.warn(msg)
    self.resolvedObjects[cacheKey] = obj
    return obj

1684 

1685 def read(self, stream): 

1686 debug = False 

1687 if debug: print(">>read", stream) 

1688 # start at the end: 

1689 stream.seek(-1, 2) 

1690 if not stream.tell(): 

1691 raise utils.PdfReadError('Cannot read an empty file') 

1692 last1K = stream.tell() - 1024 + 1 # offset of last 1024 bytes of stream 

1693 line = b_('') 

1694 while line[:5] != b_("%%EOF"): 

1695 if stream.tell() < last1K: 

1696 raise utils.PdfReadError("EOF marker not found") 

1697 line = self.readNextEndLine(stream) 

1698 if debug: print(" line:",line) 

1699 

1700 # find startxref entry - the location of the xref table 

1701 line = self.readNextEndLine(stream) 

1702 try: 

1703 startxref = int(line) 

1704 except ValueError: 

1705 # 'startxref' may be on the same line as the location 

1706 if not line.startswith(b_("startxref")): 

1707 raise utils.PdfReadError("startxref not found") 

1708 startxref = int(line[9:].strip()) 

1709 warnings.warn("startxref on same line as offset") 

1710 else: 

1711 line = self.readNextEndLine(stream) 

1712 if line[:9] != b_("startxref"): 

1713 raise utils.PdfReadError("startxref not found") 

1714 

1715 # read all cross reference tables and their trailers 

1716 self.xref = {} 

1717 self.xref_objStm = {} 

1718 self.trailer = DictionaryObject() 

1719 while True: 

1720 # load the xref table 

1721 stream.seek(startxref, 0) 

1722 x = stream.read(1) 

1723 if x == b_("x"): 

1724 # standard cross-reference table 

1725 ref = stream.read(4) 

1726 if ref[:3] != b_("ref"): 

1727 raise utils.PdfReadError("xref table read error") 

1728 readNonWhitespace(stream) 

1729 stream.seek(-1, 1) 

1730 firsttime = True; # check if the first time looking at the xref table 

1731 while True: 

1732 num = readObject(stream, self) 

1733 if firsttime and num != 0: 

1734 self.xrefIndex = num 

1735 if self.strict: 

1736 warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", utils.PdfReadWarning) 

1737 #if table not zero indexed, could be due to error from when PDF was created 

1738 #which will lead to mismatched indices later on, only warned and corrected if self.strict=True 

1739 firsttime = False 

1740 readNonWhitespace(stream) 

1741 stream.seek(-1, 1) 

1742 size = readObject(stream, self) 

1743 readNonWhitespace(stream) 

1744 stream.seek(-1, 1) 

1745 cnt = 0 

1746 while cnt < size: 

1747 line = stream.read(20) 

1748 

1749 # It's very clear in section 3.4.3 of the PDF spec 

1750 # that all cross-reference table lines are a fixed 

1751 # 20 bytes (as of PDF 1.7). However, some files have 

1752 # 21-byte entries (or more) due to the use of \r\n 

1753 # (CRLF) EOL's. Detect that case, and adjust the line 

1754 # until it does not begin with a \r (CR) or \n (LF). 

1755 while line[0] in b_("\x0D\x0A"): 

1756 stream.seek(-20 + 1, 1) 

1757 line = stream.read(20) 

1758 

1759 # On the other hand, some malformed PDF files 

1760 # use a single character EOL without a preceeding 

1761 # space. Detect that case, and seek the stream 

1762 # back one character. (0-9 means we've bled into 

1763 # the next xref entry, t means we've bled into the 

1764 # text "trailer"): 

1765 if line[-1] in b_("0123456789t"): 

1766 stream.seek(-1, 1) 

1767 

1768 offset, generation = line[:16].split(b_(" ")) 

1769 offset, generation = int(offset), int(generation) 

1770 if generation not in self.xref: 

1771 self.xref[generation] = {} 

1772 if num in self.xref[generation]: 

1773 # It really seems like we should allow the last 

1774 # xref table in the file to override previous 

1775 # ones. Since we read the file backwards, assume 

1776 # any existing key is already set correctly. 

1777 pass 

1778 else: 

1779 self.xref[generation][num] = offset 

1780 cnt += 1 

1781 num += 1 

1782 readNonWhitespace(stream) 

1783 stream.seek(-1, 1) 

1784 trailertag = stream.read(7) 

1785 if trailertag != b_("trailer"): 

1786 # more xrefs! 

1787 stream.seek(-7, 1) 

1788 else: 

1789 break 

1790 readNonWhitespace(stream) 

1791 stream.seek(-1, 1) 

1792 newTrailer = readObject(stream, self) 

1793 for key, value in list(newTrailer.items()): 

1794 if key not in self.trailer: 

1795 self.trailer[key] = value 

1796 if "/Prev" in newTrailer: 

1797 startxref = newTrailer["/Prev"] 

1798 else: 

1799 break 

1800 elif x.isdigit(): 

1801 # PDF 1.5+ Cross-Reference Stream 

1802 stream.seek(-1, 1) 

1803 idnum, generation = self.readObjectHeader(stream) 

1804 xrefstream = readObject(stream, self) 

1805 assert xrefstream["/Type"] == "/XRef" 

1806 self.cacheIndirectObject(generation, idnum, xrefstream) 

1807 streamData = BytesIO(b_(xrefstream.getData())) 

1808 # Index pairs specify the subsections in the dictionary. If 

1809 # none create one subsection that spans everything. 

1810 idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) 

1811 if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs)))) 

1812 entrySizes = xrefstream.get("/W") 

1813 assert len(entrySizes) >= 3 

1814 if self.strict and len(entrySizes) > 3: 

1815 raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes) 

1816 

1817 def getEntry(i): 

1818 # Reads the correct number of bytes for each entry. See the 

1819 # discussion of the W parameter in PDF spec table 17. 

1820 if entrySizes[i] > 0: 

1821 d = streamData.read(entrySizes[i]) 

1822 return convertToInt(d, entrySizes[i]) 

1823 

1824 # PDF Spec Table 17: A value of zero for an element in the 

1825 # W array indicates...the default value shall be used 

1826 if i == 0: return 1 # First value defaults to 1 

1827 else: return 0 

1828 

1829 def used_before(num, generation): 

1830 # We move backwards through the xrefs, don't replace any. 

1831 return num in self.xref.get(generation, []) or \ 

1832 num in self.xref_objStm 

1833 

1834 # Iterate through each subsection 

1835 last_end = 0 

1836 for start, size in self._pairs(idx_pairs): 

1837 # The subsections must increase 

1838 assert start >= last_end 

1839 last_end = start + size 

1840 for num in range(start, start+size): 

1841 # The first entry is the type 

1842 xref_type = getEntry(0) 

1843 # The rest of the elements depend on the xref_type 

1844 if xref_type == 0: 

1845 # linked list of free objects 

1846 next_free_object = getEntry(1) 

1847 next_generation = getEntry(2) 

1848 elif xref_type == 1: 

1849 # objects that are in use but are not compressed 

1850 byte_offset = getEntry(1) 

1851 generation = getEntry(2) 

1852 if generation not in self.xref: 

1853 self.xref[generation] = {} 

1854 if not used_before(num, generation): 

1855 self.xref[generation][num] = byte_offset 

1856 if debug: print(("XREF Uncompressed: %s %s"%( 

1857 num, generation))) 

1858 elif xref_type == 2: 

1859 # compressed objects 

1860 objstr_num = getEntry(1) 

1861 obstr_idx = getEntry(2) 

1862 generation = 0 # PDF spec table 18, generation is 0 

1863 if not used_before(num, generation): 

1864 if debug: print(("XREF Compressed: %s %s %s"%( 

1865 num, objstr_num, obstr_idx))) 

1866 self.xref_objStm[num] = (objstr_num, obstr_idx) 

1867 elif self.strict: 

1868 raise utils.PdfReadError("Unknown xref type: %s"% 

1869 xref_type) 

1870 

1871 trailerKeys = "/Root", "/Encrypt", "/Info", "/ID" 

1872 for key in trailerKeys: 

1873 if key in xrefstream and key not in self.trailer: 

1874 self.trailer[NameObject(key)] = xrefstream.raw_get(key) 

1875 if "/Prev" in xrefstream: 

1876 startxref = xrefstream["/Prev"] 

1877 else: 

1878 break 

1879 else: 

1880 # bad xref character at startxref. Let's see if we can find 

1881 # the xref table nearby, as we've observed this error with an 

1882 # off-by-one before. 

1883 stream.seek(-11, 1) 

1884 tmp = stream.read(20) 

1885 xref_loc = tmp.find(b_("xref")) 

1886 if xref_loc != -1: 

1887 startxref -= (10 - xref_loc) 

1888 continue 

1889 # No explicit xref table, try finding a cross-reference stream. 

1890 stream.seek(startxref, 0) 

1891 found = False 

1892 for look in range(5): 

1893 if stream.read(1).isdigit(): 

1894 # This is not a standard PDF, consider adding a warning 

1895 startxref += look 

1896 found = True 

1897 break 

1898 if found: 

1899 continue 

1900 # no xref table found at specified location 

1901 raise utils.PdfReadError("Could not find xref table at specified location") 

1902 #if not zero-indexed, verify that the table is correct; change it if necessary 

1903 if self.xrefIndex and not self.strict: 

1904 loc = stream.tell() 

1905 for gen in self.xref: 

1906 if gen == 65535: continue 

1907 for id in self.xref[gen]: 

1908 stream.seek(self.xref[gen][id], 0) 

1909 try: 

1910 pid, pgen = self.readObjectHeader(stream) 

1911 except ValueError: 

1912 break 

1913 if pid == id - self.xrefIndex: 

1914 self._zeroXref(gen) 

1915 break 

1916 #if not, then either it's just plain wrong, or the non-zero-index is actually correct 

1917 stream.seek(loc, 0) #return to where it was 

1918 

1919 def _zeroXref(self, generation): 

1920 self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) ) 

1921 

1922 def _pairs(self, array): 

1923 i = 0 

1924 while True: 

1925 yield array[i], array[i+1] 

1926 i += 2 

1927 if (i+1) >= len(array): 

1928 break 

1929 

def readNextEndLine(self, stream):
    """
    Read one line *backwards* from the current position of ``stream``.

    Bytes are read one at a time; after each read the cursor is moved two
    bytes back, so it steps towards the start of the file.  Bytes that are
    not CR or LF are prepended to the result.  When an end-of-line byte is
    met, the whole run of CR/LF bytes is consumed and the line collected so
    far is returned.

    :param stream: seekable binary stream.
    :return: the bytes of the line, without end-of-line characters.
    :raises utils.PdfReadError: if the start of the stream is reached
        before an end-of-line marker is found.
    """
    eol_chars = (b_('\n'), b_('\r'))  # LF, CR
    line = b_("")
    while True:
        # Prevent infinite loops in malformed PDFs
        if stream.tell() == 0:
            raise utils.PdfReadError("Could not read malformed PDF file")
        char = stream.read(1)
        if stream.tell() < 2:
            raise utils.PdfReadError("EOL marker not found")
        stream.seek(-2, 1)

        if char not in eol_chars:
            line = char + line
            continue

        # Consume the run of EOL bytes, remembering whether more than one
        # was seen (e.g. CR+LF).
        saw_second_eol = False
        while char in eol_chars:
            char = stream.read(1)
            if char in eol_chars:  # account for CR+LF
                stream.seek(-1, 1)
                saw_second_eol = True
            if stream.tell() < 2:
                raise utils.PdfReadError("EOL marker not found")
            stream.seek(-2, 1)
        # Step forward again over what the inner loop overshot: two bytes
        # after a multi-byte EOL, otherwise one.
        stream.seek(2 if saw_second_eol else 1, 1)
        break
    return line

1964 

def decrypt(self, password):
    """
    Try to unlock a PDF protected with the PDF Standard encryption handler.

    The password is checked against both the user password and the owner
    password of the document.  If either matches, the resulting decryption
    key is stored.  It does not matter which one matched: both give the key
    needed to use the document with this library.

    :param str password: The password to match.
    :return: ``0`` if the password failed, ``1`` if the password matched the user
        password, and ``2`` if the password matched the owner password.
    :rtype: int
    :raises NotImplementedError: if document uses an unsupported encryption
        method.
    """
    # The override flag is raised only while the check runs and is always
    # lowered again, even when _decrypt() raises.
    self._override_encryption = True
    try:
        outcome = self._decrypt(password)
    finally:
        self._override_encryption = False
    return outcome

1990 

1991 def _decrypt(self, password): 

1992 encrypt = self.trailer['/Encrypt'].getObject() 

1993 if encrypt['/Filter'] != '/Standard': 

1994 raise NotImplementedError("only Standard PDF encryption handler is available") 

1995 if not (encrypt['/V'] in (1, 2)): 

1996 raise NotImplementedError("only algorithm code 1 and 2 are supported") 

1997 user_password, key = self._authenticateUserPassword(password) 

1998 if user_password: 

1999 self._decryption_key = key 

2000 return 1 

2001 else: 

2002 rev = encrypt['/R'].getObject() 

2003 if rev == 2: 

2004 keylen = 5 

2005 else: 

2006 keylen = encrypt['/Length'].getObject() // 8 

2007 key = _alg33_1(password, rev, keylen) 

2008 real_O = encrypt["/O"].getObject() 

2009 if rev == 2: 

2010 userpass = utils.RC4_encrypt(key, real_O) 

2011 else: 

2012 val = real_O 

2013 for i in range(19, -1, -1): 

2014 new_key = b_('') 

2015 for l in range(len(key)): 

2016 new_key += b_(chr(utils.ord_(key[l]) ^ i)) 

2017 val = utils.RC4_encrypt(new_key, val) 

2018 userpass = val 

2019 owner_password, key = self._authenticateUserPassword(userpass) 

2020 if owner_password: 

2021 self._decryption_key = key 

2022 return 2 

2023 return 0 

2024 

def _authenticateUserPassword(self, password):
    """
    Compute the ``/U`` value for ``password`` and compare it with the one
    stored in the encryption dictionary.

    :param password: candidate user password.
    :return: tuple ``(matches, key)`` where ``matches`` tells whether the
        computed value equals the stored ``/U`` entry and ``key`` is the key
        returned by the algorithm helper.
    """
    encrypt = self.trailer['/Encrypt'].getObject()
    rev = encrypt['/R'].getObject()
    owner_entry = encrypt['/O'].getObject()
    p_entry = encrypt['/P'].getObject()
    id1_entry = self.trailer['/ID'].getObject()[0].getObject()
    real_U = encrypt['/U'].getObject().original_bytes
    if rev == 2:
        U, key = _alg34(password, owner_entry, p_entry, id1_entry)
    elif rev >= 3:
        key_bytes = encrypt["/Length"].getObject() // 8
        encrypt_metadata = encrypt.get(
            "/EncryptMetadata", BooleanObject(False)).getObject()
        U, key = _alg35(password, rev, key_bytes, owner_entry,
                        p_entry, id1_entry, encrypt_metadata)
        # For these revisions only the first 16 bytes are compared.
        U, real_U = U[:16], real_U[:16]
    return U == real_U, key

2042 

def getIsEncrypted(self):
    """
    Tell whether the trailer contains an ``/Encrypt`` entry.

    :return: ``True`` if ``"/Encrypt"`` is a key of ``self.trailer``.
    :rtype: bool
    """
    trailer = self.trailer
    return "/Encrypt" in trailer

2045 

# Getter only: no setter or deleter is installed, so the property is read-only.
isEncrypted = property(lambda reader: reader.getIsEncrypted())
"""
Read-only boolean property showing whether this PDF file is encrypted.
Note that this property, if true, will remain true even after the
:meth:`decrypt()<PdfFileReader.decrypt>` method is called.
"""

2052 

2053 

def getRectangle(self, name, defaults):
    """
    Fetch the rectangle stored under ``name``, trying ``defaults`` in order
    when it is missing.

    A value that is already a ``RectangleObject`` is returned unchanged.
    Otherwise the value found is resolved if it is an indirect reference,
    wrapped in a ``RectangleObject``, stored back under ``name`` and
    returned.

    :param name: key of the rectangle entry.
    :param defaults: iterable of fallback keys.
    :return: a ``RectangleObject``.
    """
    box = self.get(name)
    if isinstance(box, RectangleObject):
        return box
    if box == None:
        # Take the first fallback entry that is present.
        for fallback in defaults:
            box = self.get(fallback)
            if box != None:
                break
    if isinstance(box, IndirectObject):
        box = self.pdf.getObject(box)
    box = RectangleObject(box)
    # Store the converted value so the next lookup returns it directly.
    setRectangle(self, name, box)
    return box

2068 

2069 

def setRectangle(self, name, value):
    """
    Store ``value`` under ``name``, converting ``name`` to a ``NameObject``
    first when it is not one already.

    :param name: key of the rectangle entry.
    :param value: value to store.
    """
    key = name if isinstance(name, NameObject) else NameObject(name)
    self[key] = value

2074 

2075 

def deleteRectangle(self, name):
    """
    Remove the entry stored under ``name``.

    :param name: key of the rectangle entry to delete.
    """
    del self[name]

2078 

2079 

def createRectangleAccessor(name, fallback):
    """
    Build a property that reads, writes and deletes the rectangle stored
    under ``name``.

    :param name: key of the rectangle entry.
    :param fallback: fallback keys handed to :func:`getRectangle` on read.
    :return: a ``property`` with getter, setter and deleter.
    """
    # The helpers deliberately have no docstring: property() would copy the
    # getter's docstring into the property's own __doc__.
    def read(self):
        return getRectangle(self, name, fallback)

    def write(self, value):
        return setRectangle(self, name, value)

    def remove(self):
        return deleteRectangle(self, name)

    return property(read, write, remove)

2087 

2088 

2089class PageObject(DictionaryObject): 

2090 """ 

2091 This class represents a single page within a PDF file. Typically this 

2092 object will be created by accessing the 

2093 :meth:`getPage()<PyPDF2.PdfFileReader.getPage>` method of the 

2094 :class:`PdfFileReader<PyPDF2.PdfFileReader>` class, but it is 

2095 also possible to create an empty page with the 

2096 :meth:`createBlankPage()<PageObject.createBlankPage>` static method. 

2097 

2098 :param pdf: PDF file the page belongs to. 

2099 :param indirectRef: Stores the original indirect reference to 

2100 this object in its source PDF 

2101 """ 

def __init__(self, pdf=None, indirectRef=None):
    """
    Create an empty page dictionary.

    :param pdf: PDF file the page belongs to.
    :param indirectRef: original indirect reference to this object in its
        source PDF.
    """
    DictionaryObject.__init__(self)
    self.pdf, self.indirectRef = pdf, indirectRef

2106 

@staticmethod
def createBlankPage(pdf=None, width=None, height=None):
    """
    Returns a new blank page.
    If ``width`` or ``height`` is ``None``, both dimensions are taken
    from the last page of *pdf*.

    :param pdf: PDF file the page belongs to
    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default user
        space units.
    :return: the new blank page:
    :rtype: :class:`PageObject<PageObject>`
    :raises PageSizeNotDefinedError: if a dimension is missing and ``pdf``
        is ``None`` or contains no page
    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference  7.7.3.3)
    page[NameObject('/Type')] = NameObject('/Page')
    page[NameObject('/Parent')] = NullObject()
    page[NameObject('/Resources')] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or not pdf.getNumPages() > 0:
            raise utils.PageSizeNotDefinedError()
        lastpage = pdf.getPage(pdf.getNumPages() - 1)
        width = lastpage.mediaBox.getWidth()
        height = lastpage.mediaBox.getHeight()

    page[NameObject('/MediaBox')] = RectangleObject([0, 0, width, height])
    return page

2141 

def rotateClockwise(self, angle):
    """
    Rotates a page clockwise by increments of 90 degrees.

    :param int angle: Angle to rotate the page.  Must be an increment
        of 90 deg.
    :return: this page, so that calls can be chained.
    """
    remainder = angle % 90
    assert remainder == 0
    self._rotate(angle)
    return self

2152 

def rotateCounterClockwise(self, angle):
    """
    Rotates a page counter-clockwise by increments of 90 degrees.

    :param int angle: Angle to rotate the page.  Must be an increment
        of 90 deg.
    :return: this page, so that calls can be chained.
    """
    remainder = angle % 90
    assert remainder == 0
    # Counter-clockwise is a clockwise rotation by the negated angle.
    self._rotate(-angle)
    return self

2163 

def _rotate(self, angle):
    """
    Add ``angle`` to the page's ``/Rotate`` entry (0 when absent).

    :param angle: number of degrees to add; may be negative.
    """
    currentAngle = self.get("/Rotate", 0)
    # get() returns the stored value as-is, so /Rotate may be an indirect
    # reference; resolve it before doing arithmetic on it.
    if isinstance(currentAngle, IndirectObject):
        currentAngle = currentAngle.getObject()
    self[NameObject("/Rotate")] = NumberObject(currentAngle + angle)

2167 

@staticmethod
def _mergeResources(res1, res2, resource):
    """
    Merge one resource category of two resource dictionaries.

    Entries of ``res1`` are kept.  An entry of ``res2`` whose name is not
    yet used is added as-is.  An entry whose name is used by a different
    raw value is added under a new name made of the old name plus a UUID.

    :param res1: resource dictionary of the first page.
    :param res2: resource dictionary of the second page.
    :param resource: category key, e.g. ``"/Font"``.
    :return: tuple ``(merged, renamed)`` where ``renamed`` maps each
        clashing name of ``res2`` to its new name.
    """
    merged = DictionaryObject()
    merged.update(res1.get(resource, DictionaryObject()).getObject())
    incoming = res2.get(resource, DictionaryObject()).getObject()
    renamed = {}
    for key in list(incoming.keys()):
        if key not in merged:
            merged[key] = incoming.raw_get(key)
        elif merged.raw_get(key) != incoming.raw_get(key):
            # Same name, different object: store it under a unique name.
            unique = NameObject(key + str(uuid.uuid4()))
            renamed[key] = unique
            merged[unique] = incoming[key]
    return merged, renamed

2182 

2183 def _contentStreamRename(stream, rename, pdf): 

2184 if not rename: 

2185 return stream 

2186 stream = ContentStream(stream, pdf) 

2187 for operands, operator in stream.operations: 

2188 for i in range(len(operands)): 

2189 op = operands[i] 

2190 if isinstance(op, NameObject): 

2191 operands[i] = rename.get(op,op) 

2192 return stream 

2193 _contentStreamRename = staticmethod(_contentStreamRename) 

2194 

@staticmethod
def _pushPopGS(contents, pdf):
    """
    Wrap a content stream in a graphics state "push" (``q``) and "pop"
    (``Q``), which isolates it from changes such as transformation
    matrices.

    :param contents: content to wrap.
    :param pdf: PDF object passed on to ``ContentStream``.
    :return: the new ``ContentStream``.
    """
    wrapped = ContentStream(contents, pdf)
    operations = wrapped.operations
    operations.insert(0, [[], "q"])
    operations.append([[], "Q"])
    return wrapped

2204 

@staticmethod
def _addTransformationMatrix(contents, pdf, ctm):
    """
    Insert a transformation matrix operation at the beginning of a content
    stream.

    :param contents: content to transform.
    :param pdf: PDF object passed on to ``ContentStream``.
    :param ctm: the 6 operands of the transformation matrix.
    :return: the new ``ContentStream``.
    """
    a, b, c, d, e, f = ctm
    operands = [FloatObject(value) for value in (a, b, c, d, e, f)]
    transformed = ContentStream(contents, pdf)
    transformed.operations.insert(0, [operands, " cm"])
    return transformed

2215 

def getContents(self):
    """
    Accesses the page contents.

    :return: the resolved ``/Contents`` object, or ``None`` if the page has
        no ``/Contents`` entry (it is optional, see PDF Reference 7.7.3.3).
    """
    if "/Contents" not in self:
        return None
    return self["/Contents"].getObject()

2227 

def mergePage(self, page2):
    """
    Merges the content streams of two pages into one.  Resource references
    (i.e. fonts) are maintained from both pages.  The content stream of
    ``page2`` is added after the content stream of this page, so it is
    drawn after, or "on top" of, this page.

    :param PageObject page2: The page to be merged into this one.  Should be
        an instance of :class:`PageObject<PageObject>`.
    """
    # Plain merge: no transformation, no matrix, no page expansion.
    self._mergePage(page2)

2240 

def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False):
    """
    Merge ``page2`` into this page: annotations, resources and content.

    :param page2: the page to merge into this one.
    :param page2transformation: optional callable applied to the content
        of ``page2`` before it is merged.
    :param ctm: optional 6 operands of the transformation matrix, used to
        compute the expanded media box.
    :param bool expand: whether this page's media box is enlarged to also
        cover the (transformed) media box of ``page2``.
    """
    # Resources are merged first: this tells us which names used in the
    # content stream of page2 have to be renamed.
    mergedResources = DictionaryObject()
    renames = {}
    ownResources = self["/Resources"].getObject()
    otherResources = page2["/Resources"].getObject()

    mergedAnnots = ArrayObject()
    for page in (self, page2):
        if "/Annots" not in page:
            continue
        annots = page["/Annots"]
        if isinstance(annots, ArrayObject):
            mergedAnnots.extend(annots)

    resourceKinds = ("/ExtGState", "/Font", "/XObject", "/ColorSpace",
                     "/Pattern", "/Shading", "/Properties")
    for kind in resourceKinds:
        merged, kindRenames = PageObject._mergeResources(
            ownResources, otherResources, kind)
        if merged:
            mergedResources[NameObject(kind)] = merged
            renames.update(kindRenames)

    # Combine /ProcSet sets.
    ownProcSet = frozenset(
        ownResources.get("/ProcSet", ArrayObject()).getObject())
    otherProcSet = frozenset(
        otherResources.get("/ProcSet", ArrayObject()).getObject())
    mergedResources[NameObject("/ProcSet")] = ArrayObject(
        ownProcSet.union(otherProcSet))

    contentStreams = ArrayObject()

    ownContent = self.getContents()
    if ownContent is not None:
        contentStreams.append(PageObject._pushPopGS(ownContent, self.pdf))

    otherContent = page2.getContents()
    if otherContent is not None:
        if page2transformation is not None:
            otherContent = page2transformation(otherContent)
        otherContent = PageObject._contentStreamRename(
            otherContent, renames, self.pdf)
        contentStreams.append(PageObject._pushPopGS(otherContent, self.pdf))

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        ownBox = self.mediaBox
        otherBox = page2.mediaBox
        ownCorners = [ownBox.getLowerLeft_x().as_numeric(),
                      ownBox.getLowerLeft_y().as_numeric(),
                      ownBox.getUpperRight_x().as_numeric(),
                      ownBox.getUpperRight_y().as_numeric()]
        # The four corners of page2's media box as (x, y) points.
        otherCorners = [
            (otherBox.getLowerLeft_x().as_numeric(),
             otherBox.getLowerLeft_y().as_numeric()),
            (otherBox.getUpperLeft_x().as_numeric(),
             otherBox.getUpperLeft_y().as_numeric()),
            (otherBox.getUpperRight_x().as_numeric(),
             otherBox.getUpperRight_y().as_numeric()),
            (otherBox.getLowerRight_x().as_numeric(),
             otherBox.getLowerRight_y().as_numeric()),
        ]
        if ctm is not None:
            # Map every corner through the matrix.
            ctm = [float(x) for x in ctm]
            new_x = [ctm[0]*x + ctm[2]*y + ctm[4] for x, y in otherCorners]
            new_y = [ctm[1]*x + ctm[3]*y + ctm[5] for x, y in otherCorners]
        else:
            new_x = [x for x, _ in otherCorners]
            new_y = [y for _, y in otherCorners]

        # Bounding box covering both this page and the merged page.
        lowerleft = [min(ownCorners[0], min(new_x)),
                     min(ownCorners[1], min(new_y))]
        upperright = [max(ownCorners[2], max(new_x)),
                      max(ownCorners[3], max(new_y))]

        ownBox.setLowerLeft(lowerleft)
        ownBox.setUpperRight(upperright)

    self[NameObject('/Contents')] = ContentStream(contentStreams, self.pdf)
    self[NameObject('/Resources')] = mergedResources
    self[NameObject('/Annots')] = mergedAnnots

2314 

def mergeTransformedPage(self, page2, ctm, expand=False):
    """
    This is similar to mergePage, but a transformation matrix is
    applied to the merged stream.

    :param PageObject page2: The page to be merged into this one.  Should be
        an instance of :class:`PageObject<PageObject>`.
    :param tuple ctm: a 6-element tuple containing the operands of the
        transformation matrix
    :param bool expand: Whether the page should be expanded to fit the dimensions
        of the page to be merged.
    """
    def transform(page2Content):
        # The matrix is prepended to page2's content, using page2's PDF.
        return PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm)

    self._mergePage(page2, transform, ctm, expand)

2329 

def mergeScaledPage(self, page2, scale, expand=False):
    """
    This is similar to mergePage, but the stream to be merged is scaled
    by applying a transformation matrix.

    :param PageObject page2: The page to be merged into this one.  Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    # CTM to scale : [ sx 0 0 sy 0 0 ]
    ctm = [scale, 0,
           0, scale,
           0, 0]
    return self.mergeTransformedPage(page2, ctm, expand)

2345 

def mergeRotatedPage(self, page2, rotation, expand=False):
    """
    This is similar to mergePage, but the stream to be merged is rotated
    by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one.  Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float rotation: The angle of the rotation, in degrees
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    angle = math.radians(rotation)
    cos_a, sin_a = math.cos(angle), math.sin(angle)
    ctm = [cos_a, sin_a,
           -sin_a, cos_a,
           0, 0]
    return self.mergeTransformedPage(page2, ctm, expand)

2362 

def mergeTranslatedPage(self, page2, tx, ty, expand=False):
    """
    This is similar to mergePage, but the stream to be merged is translated
    by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one.  Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    # Identity matrix with the offsets in the last two operands.
    ctm = [1, 0,
           0, 1,
           tx, ty]
    return self.mergeTransformedPage(page2, ctm, expand)

2378 

def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False):
    """
    This is similar to mergePage, but the stream to be merged is rotated
    and translated by applying a transformation matrix.

    The matrix is the product of a translation by ``(-tx, -ty)``, the
    rotation, and a translation by ``(tx, ty)``, in that order.

    :param PageObject page2: the page to be merged into this one.  Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param float rotation: The angle of the rotation, in degrees
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    shift_out = [[1, 0, 0],
                 [0, 1, 0],
                 [-tx, -ty, 1]]
    angle = math.radians(rotation)
    cos_a, sin_a = math.cos(angle), math.sin(angle)
    rotate = [[cos_a, sin_a, 0],
              [-sin_a, cos_a, 0],
              [0, 0, 1]]
    shift_back = [[1, 0, 0],
                  [0, 1, 0],
                  [tx, ty, 1]]
    ctm = utils.matrixMultiply(
        utils.matrixMultiply(shift_out, rotate), shift_back)

    # Keep the first two columns of the 3x3 matrix: the 6 PDF operands.
    operands = [ctm[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands, expand)

2409 

def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False):
    """
    This is similar to mergePage, but the stream to be merged is rotated
    and scaled by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one.  Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float rotation: The angle of the rotation, in degrees
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    angle = math.radians(rotation)
    cos_a, sin_a = math.cos(angle), math.sin(angle)
    rotate = [[cos_a, sin_a, 0],
              [-sin_a, cos_a, 0],
              [0, 0, 1]]
    resize = [[scale, 0, 0],
              [0, scale, 0],
              [0, 0, 1]]
    ctm = utils.matrixMultiply(rotate, resize)

    # Keep the first two columns of the 3x3 matrix: the 6 PDF operands.
    operands = [ctm[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands, expand)

2435 

def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False):
    """
    This is similar to mergePage, but the stream to be merged is translated
    and scaled by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one.  Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float scale: The scaling factor
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    shift = [[1, 0, 0],
             [0, 1, 0],
             [tx, ty, 1]]
    resize = [[scale, 0, 0],
              [0, scale, 0],
              [0, 0, 1]]
    ctm = utils.matrixMultiply(resize, shift)

    # Keep the first two columns of the 3x3 matrix: the 6 PDF operands.
    operands = [ctm[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands, expand)

2461 

def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False):
    """
    This is similar to mergePage, but the stream to be merged is translated,
    rotated and scaled by applying a transformation matrix.

    The matrix is the product of the rotation, the scaling and the
    translation, in that order.

    :param PageObject page2: the page to be merged into this one.  Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param float rotation: The angle of the rotation, in degrees
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    shift = [[1, 0, 0],
             [0, 1, 0],
             [tx, ty, 1]]
    angle = math.radians(rotation)
    cos_a, sin_a = math.cos(angle), math.sin(angle)
    rotate = [[cos_a, sin_a, 0],
              [-sin_a, cos_a, 0],
              [0, 0, 1]]
    resize = [[scale, 0, 0],
              [0, scale, 0],
              [0, 0, 1]]
    ctm = utils.matrixMultiply(utils.matrixMultiply(rotate, resize), shift)

    # Keep the first two columns of the 3x3 matrix: the 6 PDF operands.
    operands = [ctm[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands, expand)

2492 

def addTransformation(self, ctm):
    """
    Applies a transformation matrix to the page.

    Nothing happens when the page has no contents.

    :param tuple ctm: A 6-element tuple containing the operands of the
        transformation matrix.
    """
    originalContent = self.getContents()
    if originalContent is None:
        return
    transformed = PageObject._addTransformationMatrix(
        originalContent, self.pdf, ctm)
    # Wrap in q/Q so the matrix stays confined to this content.
    self[NameObject('/Contents')] = PageObject._pushPopGS(transformed, self.pdf)

2511 

def scale(self, sx, sy):
    """
    Scales a page by the given factors by applying a transformation
    matrix to its content and updating the page size.

    When the page has a ``/VP`` entry, the ``/BBox`` of the viewport (of
    the first viewport if ``/VP`` is an array) is scaled as well.

    :param float sx: The scaling factor on horizontal axis.
    :param float sy: The scaling factor on vertical axis.
    """
    self.addTransformation([sx, 0,
                            0, sy,
                            0, 0])
    box = self.mediaBox
    self.mediaBox = RectangleObject([
        float(box.getLowerLeft_x()) * sx,
        float(box.getLowerLeft_y()) * sy,
        float(box.getUpperRight_x()) * sx,
        float(box.getUpperRight_y()) * sy])

    if "/VP" not in self:
        return
    viewport = self["/VP"]
    isArray = isinstance(viewport, ArrayObject)
    bbox = viewport[0]["/BBox"] if isArray else viewport["/BBox"]
    # x coordinates (indices 0 and 2) use sx, y coordinates use sy.
    scaled_bbox = RectangleObject([
        float(bbox[index]) * factor
        for index, factor in enumerate((sx, sy, sx, sy))])
    target = self[NameObject("/VP")]
    if isArray:
        target = target[NumberObject(0)]
    target[NameObject("/BBox")] = scaled_bbox

2543 

def scaleBy(self, factor):
    """
    Scales a page by the given factor by applying a transformation
    matrix to its content and updating the page size.

    :param float factor: The scaling factor (for both X and Y axis).
    """
    sx = sy = factor
    self.scale(sx, sy)

2552 

def scaleTo(self, width, height):
    """
    Scales a page to the specified dimensions by applying a
    transformation matrix to its content and updating the page size.

    :param float width: The new width.
    :param float height: The new height.
    """
    box = self.mediaBox
    # Each factor is the requested size over the current media box size.
    sx = width / float(box.getUpperRight_x() - box.getLowerLeft_x())
    sy = height / float(box.getUpperRight_y() - box.getLowerLeft_y())
    self.scale(sx, sy)

2566 

def compressContentStreams(self):
    """
    Replace the page contents with a single flate-encoded content stream.

    The contents are wrapped in a :class:`ContentStream` when they are not
    one already (which joins multiple content streams), and the result of
    its ``flateEncode()`` is stored back under ``/Contents``.

    Nothing happens when the page has no contents.
    """
    stream = self.getContents()
    if stream is None:
        return
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)
    self[NameObject("/Contents")] = stream.flateEncode()

2580 

def extractText(self):
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text.  This works well for some PDF
    files, but poorly for others, depending on the generator used.  Do not
    rely on the order of text coming out of this function, as it may change
    if this function is made more sophisticated.

    A page without a ``/Contents`` entry yields an empty string.

    :return: a unicode string object.
    """
    text = u_("")
    if "/Contents" not in self:
        # /Contents is optional in a page dictionary (a blank page has
        # none); there is simply no text to extract.
        return text

    content = self["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)

    # Note: only TextStringObjects are collected.  ByteStringObjects are
    # strings where the byte->string encoding was unknown, so adding them
    # to the text here would be gibberish.
    pieces = []
    for operands, operator in content.operations:
        if operator == b_("Tj"):
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == b_("T*"):
            pieces.append("\n")
        elif operator == b_("'"):
            # Starts a new line unconditionally, then shows its operand.
            pieces.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == b_('"'):
            # The string is the third operand; the line break is only
            # emitted together with a decodable string.
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                pieces.append("\n")
                pieces.append(shown)
        elif operator == b_("TJ"):
            # The operand is an array mixing strings and other entries;
            # keep the decodable strings and end the run with a newline.
            for item in operands[0]:
                if isinstance(item, TextStringObject):
                    pieces.append(item)
            pieces.append("\n")
    # Joining once avoids repeatedly copying the growing result.
    return text.join(pieces)

2622 

# Page boundary boxes.  Each accessor is built by createRectangleAccessor from
# the page dictionary key it exposes and a tuple of other keys -- presumably
# the entries consulted, in order, when the key itself is absent (confirm
# against createRectangleAccessor).  The bare string after each assignment
# documents the attribute.

# /MediaBox: no alternative keys are given.
mediaBox = createRectangleAccessor("/MediaBox", ())
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
defining the boundaries of the physical medium on which the page is
intended to be displayed or printed.
"""

# /CropBox: alternative key is /MediaBox.
cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
defining the visible region of default user space. When the page is
displayed or printed, its contents are to be clipped (cropped) to this
rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as :attr:`mediaBox<mediaBox>`.
"""

# /BleedBox: alternative keys are /CropBox, then /MediaBox.
bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox"))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
defining the region to which the contents of the page should be clipped
when output in a production enviroment.
"""

# /TrimBox: alternative keys are /CropBox, then /MediaBox.
trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox"))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
defining the intended dimensions of the finished page after trimming.
"""

# /ArtBox: alternative keys are /CropBox, then /MediaBox.
artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox"))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
defining the extent of the page's meaningful content as intended by the
page's creator.
"""

2658 

2659 

class ContentStream(DecodedStreamObject):
    """
    A page content stream parsed into a list of operations.

    ``operations`` holds ``(operands, operator)`` tuples in stream order.
    An inline image is stored as
    ``({"settings": <DictionaryObject>, "data": <bytes>}, b"INLINE IMAGE")``.
    Reading ``_data`` serialises the operations back to bytes; assigning to
    ``_data`` replaces the operations with those parsed from the new bytes.
    """

    def __init__(self, stream, pdf):
        """
        :param stream: a stream object, or an ``ArrayObject`` of stream
            objects whose data is joined before parsing.  ``getObject()``
            is called on the argument and on every array element.
        :param pdf: stored as ``self.pdf`` and handed to ``readObject``
            while reading inline image dictionaries.
        """
        self.pdf = pdf
        self.operations = []
        stream = stream.getObject()
        if isinstance(stream, ArrayObject):
            # Content may only be split across streams between tokens, so
            # the parts are joined with whitespace: plain concatenation
            # could fuse the last token of one stream with the first token
            # of the next (e.g. "q" + "BT" -> "qBT").
            data = b_("\n").join(
                [b_(part.getObject().getData()) for part in stream])
        else:
            data = b_(stream.getData())
        self.__parseContentStream(BytesIO(data))

    def __parseContentStream(self, stream):
        """
        Parse *stream* from its start and append what is found to
        ``self.operations``.

        Parsing stops at end of stream or at a NUL where a token was
        expected.  Operands seen after the last operator are dropped.
        """
        stream.seek(0, 0)
        operands = []
        while True:
            peek = readNonWhitespace(stream)
            if peek == b_('') or ord_(peek) == 0:
                break
            stream.seek(-1, 1)
            if peek.isalpha() or peek == b_("'") or peek == b_('"'):
                operator = utils.readUntilRegex(
                    stream, NameObject.delimiterPattern, True)
                if operator == b_("BI"):
                    # Begin inline image: it needs its own parser and
                    # takes no preceding operands.
                    assert operands == []
                    ii = self._readInlineImage(stream)
                    self.operations.append((ii, b_("INLINE IMAGE")))
                else:
                    self.operations.append((operands, operator))
                    operands = []
            elif peek == b_('%'):
                # Comments are skipped here rather than in readObject,
                # because readObject assumes that an object follows the
                # comment, whereas here it could be an operator.
                # The empty read is a terminator too: a comment that runs
                # to the end of the stream must not loop forever.
                while peek not in (b_('\r'), b_('\n'), b_('')):
                    peek = stream.read(1)
            else:
                operands.append(readObject(stream, None))

    def _readInlineImage(self, stream):
        """
        Read an inline image, starting just after its ``BI`` operator.

        The key/value pairs up to ``ID`` are collected first; the image
        bytes then run until an ``EI`` that is followed by whitespace and
        ``Q``, or by the end of the stream.  In the first case the stream
        is left positioned on the ``Q``.

        :return: ``{"settings": DictionaryObject, "data": bytes}``
        :raises utils.PdfReadError: if the stream ends inside the settings
            or inside the image data.
        """
        settings = DictionaryObject()
        while True:
            tok = readNonWhitespace(stream)
            if tok == b_(""):
                # Seeking back at end of stream would re-read the previous
                # byte over and over.
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image dictionary")
            stream.seek(-1, 1)
            if tok == b_("I"):
                # "ID" - begin of image data
                break
            key = readObject(stream, self.pdf)
            tok = readNonWhitespace(stream)
            if tok == b_(""):
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image dictionary")
            stream.seek(-1, 1)
            value = readObject(stream, self.pdf)
            settings[key] = value

        # Positioned at "ID"; also consume the single byte that follows it.
        tmp = stream.read(3)
        assert tmp[:2] == b_("ID")

        # Collected in a list and joined once; "+=" on bytes would copy the
        # whole image for every byte read.
        chunks = []
        while True:
            tok = stream.read(1)
            if tok == b_(""):
                raise utils.PdfReadError(
                    "Unexpected end of stream in inline image data")
            if tok != b_("E"):
                chunks.append(tok)
                continue

            # Possible End Image operator.
            tok2 = stream.read(1)
            if tok2 != b_("I"):
                # Push the byte back, unless nothing was read (seeking
                # back at end of stream would re-read the "E" forever).
                if tok2 != b_(""):
                    stream.seek(-1, 1)
                chunks.append(tok)
                continue

            # The data itself can contain "EI", so it only counts as the
            # operator when whitespace and a "Q" follow.
            tok3 = stream.read(1)
            info = tok + tok2
            has_q_whitespace = False
            while tok3 != b_("") and tok3 in utils.WHITESPACES:
                has_q_whitespace = True
                info += tok3
                tok3 = stream.read(1)
            if tok3 == b_(""):
                # Nothing follows this "EI", so it cannot be image data.
                break
            stream.seek(-1, 1)
            if tok3 == b_("Q") and has_q_whitespace:
                break
            chunks.append(info)
        return {"settings": settings, "data": b_("").join(chunks)}

    def _getData(self):
        """Serialise ``self.operations`` into content stream bytes."""
        newdata = BytesIO()
        for operands, operator in self.operations:
            if operator == b_("INLINE IMAGE"):
                newdata.write(b_("BI"))
                dicttext = BytesIO()
                operands["settings"].writeToStream(dicttext, None)
                # Drop the first and last two bytes of the serialised
                # dictionary (its "<<" / ">>" delimiters, presumably).
                newdata.write(dicttext.getvalue()[2:-2])
                newdata.write(b_("ID "))
                newdata.write(operands["data"])
                newdata.write(b_("EI"))
            else:
                for op in operands:
                    op.writeToStream(newdata, None)
                    newdata.write(b_(" "))
                newdata.write(b_(operator))
            newdata.write(b_("\n"))
        return newdata.getvalue()

    def _setData(self, value):
        """Replace the operations with those parsed from *value*."""
        # Start from an empty list: the parser only appends, and assigning
        # new data must not keep the operations of the old data.
        self.operations = []
        self.__parseContentStream(BytesIO(b_(value)))

    _data = property(_getData, _setData)

2779 

2780 

class DocumentInformation(DictionaryObject):
    """
    A class representing the basic document metadata provided in a PDF File.
    This class is accessible through
    :meth:`getDocumentInfo()<PyPDF2.PdfFileReader.getDocumentInfo()>`

    All text properties of the document metadata have
    *two* properties, eg. author and author_raw. The non-raw property will
    always return a ``TextStringObject``, making it ideal for a case where
    the metadata is being displayed. The raw property can sometimes return
    a ``ByteStringObject``, if PyPDF2 was unable to decode the string's
    text encoding; this requires additional safety in the caller and
    therefore is not as commonly accessed.
    """

    def __init__(self):
        DictionaryObject.__init__(self)

    def getText(self, key):
        """
        Return the entry stored under *key* when it is a
        ``TextStringObject``; otherwise (missing, or another type) return
        ``None``.
        """
        value = self.get(key, None)
        return value if isinstance(value, TextStringObject) else None

    @property
    def title(self):
        """Read-only property accessing the document's **title**.
        Returns a unicode string (``TextStringObject``) or ``None``
        if the title is not specified."""
        return self.getText("/Title")

    @property
    def title_raw(self):
        """The "raw" version of title; can return a ``ByteStringObject``."""
        return self.get("/Title")

    @property
    def author(self):
        """Read-only property accessing the document's **author**.
        Returns a unicode string (``TextStringObject``) or ``None``
        if the author is not specified."""
        return self.getText("/Author")

    @property
    def author_raw(self):
        """The "raw" version of author; can return a ``ByteStringObject``."""
        return self.get("/Author")

    @property
    def subject(self):
        """Read-only property accessing the document's **subject**.
        Returns a unicode string (``TextStringObject``) or ``None``
        if the subject is not specified."""
        return self.getText("/Subject")

    @property
    def subject_raw(self):
        """The "raw" version of subject; can return a ``ByteStringObject``."""
        return self.get("/Subject")

    @property
    def creator(self):
        """Read-only property accessing the document's **creator**. If the
        document was converted to PDF from another format, this is the name
        of the application (e.g. OpenOffice) that created the original
        document from which it was converted. Returns a unicode string
        (``TextStringObject``) or ``None`` if the creator is not specified."""
        return self.getText("/Creator")

    @property
    def creator_raw(self):
        """The "raw" version of creator; can return a ``ByteStringObject``."""
        return self.get("/Creator")

    @property
    def producer(self):
        """Read-only property accessing the document's **producer**.
        If the document was converted to PDF from another format, this is
        the name of the application (for example, OSX Quartz) that converted
        it to PDF. Returns a unicode string (``TextStringObject``)
        or ``None`` if the producer is not specified."""
        return self.getText("/Producer")

    @property
    def producer_raw(self):
        """The "raw" version of producer; can return a ``ByteStringObject``."""
        return self.get("/Producer")

2843 

2844 

def convertToInt(d, size):
    """
    Interpret the byte string *d* as a big-endian integer.

    *d* is left-padded with zero bytes, its last 8 bytes are kept and
    unpacked as a signed 64-bit value.  *size* is only validated; it does
    not limit how much of *d* is used.

    :raises utils.PdfReadError: if *size* is greater than 8.
    """
    if size > 8:
        raise utils.PdfReadError("invalid size in convertToInt")
    padded = (b_("\x00" * 8) + b_(d))[-8:]
    (value,) = struct.unpack(">q", padded)
    return value

2851 

# The 32-byte password padding string of the standard security handler
# (ref: pdf1.8 spec section 3.5.2 algorithm 3.2).
_encryption_padding = b_(
    '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56'
    '\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c'
    '\xa9\xfe\x64\x53\x69\x7a')

2856 

2857 

# Implementation of algorithm 3.2 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
    """
    Compute the encryption key for a user password.

    :param password: the user password (text or bytes).
    :param rev: revision of the security handler; revision 3 or greater
        enables steps 6 and 8.
    :param keylen: length of the key, in bytes.
    :param owner_entry: the /O entry; its ``original_bytes`` are hashed.
    :param p_entry: the /P entry as an integer.  Only its low 32 bits are
        used, so both the usual signed form (e.g. ``-1852``) and the same
        flags written as an unsigned number are accepted.
    :param id1_entry: first element of the file identifier array; its
        ``original_bytes`` are hashed.
    :param metadata_encrypt: whether document metadata is encrypted.
    :return: the first *keylen* bytes of the final MD5 digest.
    """
    import struct
    # 1. Pad or truncate the password string to exactly 32 bytes. If the
    # password string is more than 32 bytes long, use only its first 32 bytes;
    # if it is less than 32 bytes long, pad it by appending the required number
    # of additional bytes from the beginning of the padding string
    # (_encryption_padding).
    password = b_((str_(password) + str_(_encryption_padding))[:32])
    # 2. Initialize the MD5 hash function and pass the result of step 1 as
    # input to this function.
    m = md5(password)
    # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash
    # function.
    m.update(owner_entry.original_bytes)
    # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass
    # these bytes to the MD5 hash function, low-order byte first.
    # Masking gives the two's complement bit pattern of a negative value, so
    # the bytes are the same as packing it signed, while a value written as an
    # unsigned 32-bit number no longer raises struct.error.
    m.update(struct.pack('<I', p_entry & 0xFFFFFFFF))
    # 5. Pass the first element of the file's file identifier array to the MD5
    # hash function.
    m.update(id1_entry.original_bytes)
    # 6. (Revision 3 or greater) If document metadata is not being encrypted,
    # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function.
    if rev >= 3 and not metadata_encrypt:
        m.update(b_("\xff\xff\xff\xff"))
    # 7. Finish the hash.
    md5_hash = m.digest()
    # 8. (Revision 3 or greater) Do the following 50 times: Take the output
    # from the previous MD5 hash and pass the first n bytes of the output as
    # input into a new MD5 hash, where n is the number of bytes of the
    # encryption key as defined by the value of the encryption dictionary's
    # /Length entry.
    if rev >= 3:
        for _ in range(50):
            md5_hash = md5(md5_hash[:keylen]).digest()
    # 9. Set the encryption key to the first n bytes of the output from the
    # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or
    # greater, depends on the value of the encryption dictionary's /Length
    # entry.
    return md5_hash[:keylen]

2900 

2901 

# Implementation of algorithm 3.3 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg33(owner_pwd, user_pwd, rev, keylen):
    """
    Compute the value of the /O entry of the encryption dictionary.

    :param owner_pwd: the owner password, used to derive the RC4 key.
    :param user_pwd: the user password (text or bytes); it is padded and
        then encrypted with that key.
    :param rev: revision of the security handler; revision 3 or greater
        adds the 19 extra RC4 passes of step 7.
    :param keylen: length of the key, in bytes.
    :return: the output of the final RC4 invocation.
    """
    # steps 1 - 4
    key = _alg33_1(owner_pwd, rev, keylen)
    # 5. Pad or truncate the user password string as described in step 1 of
    # algorithm 3.2.
    # str_() is applied to the password as in _alg32, so that a bytes
    # password is accepted here too instead of raising TypeError.
    user_pwd = b_((str_(user_pwd) + str_(_encryption_padding))[:32])
    # 6. Encrypt the result of step 5, using an RC4 encryption function with
    # the encryption key obtained in step 4.
    val = utils.RC4_encrypt(key, user_pwd)
    # 7. (Revision 3 or greater) Do the following 19 times: Take the output
    # from the previous invocation of the RC4 function and pass it as input to
    # a new invocation of the function; use an encryption key generated by
    # taking each byte of the encryption key obtained in step 4 and performing
    # an XOR operation between that byte and the single-byte value of the
    # iteration counter (from 1 to 19).
    if rev >= 3:
        for i in range(1, 20):
            # Built as bytes, like the key of step 6 and like _alg35 does.
            new_key = b_('').join(
                [b_(chr(ord_(byte) ^ i)) for byte in key])
            val = utils.RC4_encrypt(new_key, val)
    # 8. Store the output from the final invocation of the RC4 as the value of
    # the /O entry in the encryption dictionary.
    return val

2928 

2929 

# Steps 1-4 of algorithm 3.3
def _alg33_1(password, rev, keylen):
    """
    Derive the RC4 key used by algorithm 3.3 from the owner password.

    :param password: the owner password (text or bytes).
    :param rev: revision of the security handler; revision 3 or greater
        re-hashes the digest 50 times.
    :param keylen: length of the key, in bytes.
    :return: the first *keylen* bytes of the final MD5 digest.
    """
    # 1. Pad or truncate the owner password string as described in step 1 of
    # algorithm 3.2. If there is no owner password, use the user password
    # instead.
    # str_() is applied to the password as in _alg32, so that a bytes
    # password is accepted here too instead of raising TypeError.
    password = b_((str_(password) + str_(_encryption_padding))[:32])
    # 2. Initialize the MD5 hash function and pass the result of step 1 as
    # input to this function.
    md5_hash = md5(password).digest()
    # 3. (Revision 3 or greater) Do the following 50 times: Take the output
    # from the previous MD5 hash and pass it as input into a new MD5 hash.
    if rev >= 3:
        for _ in range(50):
            md5_hash = md5(md5_hash).digest()
    # 4. Create an RC4 encryption key using the first n bytes of the output
    # from the final MD5 hash, where n is always 5 for revision 2 but, for
    # revision 3 or greater, depends on the value of the encryption
    # dictionary's /Length entry.
    return md5_hash[:keylen]

2951 

2952 

# Implementation of algorithm 3.4 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg34(password, owner_entry, p_entry, id1_entry):
    """
    Compute the /U entry of the encryption dictionary for revision 2.

    :return: a ``(U, key)`` tuple: the RC4-encrypted padding string and the
        encryption key it was encrypted with.
    """
    # 1. Derive the encryption key from the user password with algorithm
    # 3.2, using revision 2 and its fixed 5-byte key length.
    rev, keylen = 2, 5
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # 2. RC4-encrypt the 32-byte padding string of algorithm 3.2 with that
    # key.
    # 3. The result is the value of the /U entry.
    return utils.RC4_encrypt(key, _encryption_padding), key

2966 

2967 

# Implementation of algorithm 3.5 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
    """
    Compute the /U entry of the encryption dictionary for revision 3 or
    greater.

    ``metadata_encrypt`` is accepted but not used: the key is derived with
    the default ``metadata_encrypt`` of ``_alg32``.

    :return: a ``(U, key)`` tuple: the 32-byte /U value and the encryption
        key.
    """
    # 1. Derive the encryption key from the user password (Algorithm 3.2).
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # 2. Hash the 32-byte padding string of Algorithm 3.2 ...
    hasher = md5()
    hasher.update(_encryption_padding)
    # 3. ... followed by the first element of the file identifier array,
    # and finish the hash.
    hasher.update(id1_entry.original_bytes)
    digest = hasher.digest()
    # 4. RC4-encrypt the 16-byte digest with the key from step 1.
    val = utils.RC4_encrypt(key, digest)
    # 5. Re-encrypt the output 19 more times; the key of pass i is the
    # original key with every byte XORed with i (i from 1 to 19).
    for i in range(1, 20):
        pass_key = b_('').join(
            [b_(chr(ord_(key[pos]) ^ i)) for pos in range(len(key))])
        val = utils.RC4_encrypt(pass_key, val)
    # 6. Append 16 bytes of arbitrary padding to get the 32-byte /U value.
    # (implementor's note: it is not clear what "arbitrary padding" is
    # supposed to mean, so null bytes are used.  This seems to match a few
    # other implementations.)
    padding = b_('\x00') * 16
    return val + padding, key