<< Masa.20101125-check-object_stream-structure | 2010 | Masa.20101122-testcase-grant_download-command >>
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v16.pdf 'incorrect header check' when filtering with /FlateDecode ./lib/rpdf2txt/parser.rb:146:in `scan_object_stream': undefined method `[]' for nil:NilClass (NoMethodError) from ./lib/rpdf2txt/parser.rb:137:in `build_object_catalogue' from ./lib/rpdf2txt/parser.rb:134:in `each' from ./lib/rpdf2txt/parser.rb:134:in `build_object_catalogue' from ./lib/rpdf2txt/parser.rb:47:in `object_catalogue' from ./lib/rpdf2txt/parser.rb:160:in `page_tree_root' from ./lib/rpdf2txt/parser.rb:142:in `build_page_tree' from ./lib/rpdf2txt/parser.rb:50:in `page_tree' from ./lib/rpdf2txt/parser.rb:40:in `extract_text' from bin/rpdf2txt:58
Experiment
lib/rpdf2txt/parser.rb#build_object, #build_object_catalogue
def build_object(src) case src when /\/Type\s*\/Catalog\b/n CatalogNode.new(src, @target_encoding) when /\/Type\s*\/Pages\b/n PageNode.new(src, @target_encoding) when /\/Type\s*\/Page\b/n PageLeaf.new(src, @target_encoding) when /\/Type\s*\/Font\b/n Font.new(src, @target_encoding) when /\/Type\s*\/FontDescriptor\b/n FontDescriptor.new(src, @target_encoding) when /\/Type\s*\/Encoding\b/n Encoding.new(src, @target_encoding) when /\/Type\s*\/ObjStm\b/n p "getin ObjStm" ObjStream.new(src, @target_encoding) when /\/Type\s*\/XRef\b/n TrailerDictionary.new(src, @target_encoding) when %r!/Subtype\s*/Image!n Image.new(src, @target_encoding) when /\bstream\b/n, %r{/ToUnicode\b}n Stream.new(src, @target_encoding) when /\/Font\s*<</mn Resource.new(src, @target_encoding) when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s+){2}R\s*)*\]\s+endobj/mn ReferenceArray.new(src, @target_encoding) when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s*))*\]\s+endobj/mn PdfArray.new(src, @target_encoding) when /obj\s*<</mn PdfHash.new(src, @target_encoding) else Unknown.new(src, @target_encoding) end end ... def build_object_catalogue startobj=0 endobj=0 catalogue = {} @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match| obj = build_object(match.to_s) catalogue.store(obj.oid, obj) end catalogue.values.select do |obj| obj.is_a?(ObjStream) end.each do |obj| scan_object_stream obj.decoded_stream, catalogue end catalogue p "endof build_object_catalogue" exit
Run
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v14.pdf "endof build_object_catalogue" masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v16.pdf "getin ObjStm" 'incorrect header check' when filtering with /FlateDecode ./lib/rpdf2txt/parser.rb:150:in `scan_object_stream': undefined method `[]' for nil:NilClass (NoMethodError) from ./lib/rpdf2txt/parser.rb:138:in `build_object_catalogue' from ./lib/rpdf2txt/parser.rb:135:in `each' from ./lib/rpdf2txt/parser.rb:135:in `build_object_catalogue' from ./lib/rpdf2txt/parser.rb:47:in `object_catalogue' from ./lib/rpdf2txt/parser.rb:164:in `page_tree_root' from ./lib/rpdf2txt/parser.rb:145:in `build_page_tree' from ./lib/rpdf2txt/parser.rb:50:in `page_tree' from ./lib/rpdf2txt/parser.rb:40:in `extract_text' from bin/rpdf2txt:58
Notes
Experiment
def build_object(src) case src when /\/Type\s*\/Catalog\b/n @ob["CatalogNode"]+=1 if @ob["CatalogNode"]||=0 CatalogNode.new(src, @target_encoding) when /\/Type\s*\/Pages\b/n @ob["PageNode"]+=1 if @ob["PageNode"]||=0 PageNode.new(src, @target_encoding) when /\/Type\s*\/Page\b/n @ob["PageLeaf"]+=1 if @ob["PageLeaf"]||=0 PageLeaf.new(src, @target_encoding) when /\/Type\s*\/Font\b/n @ob["Font"]+=1 if @ob["Font"]||=0 Font.new(src, @target_encoding) when /\/Type\s*\/FontDescriptor\b/n @ob["FontDescriptor"]+=1 if @ob["FontDescriptor"]||=0 FontDescriptor.new(src, @target_encoding) when /\/Type\s*\/Encoding\b/n @ob["Encoding"]+=1 if @ob["Encoding"]||=0 Encoding.new(src, @target_encoding) when /\/Type\s*\/ObjStm\b/n @ob["ObjStream"]+=1 if @ob["ObjStream"]||=0 ObjStream.new(src, @target_encoding) when /\/Type\s*\/XRef\b/n @ob["TrailerDictionary"]+=1 if @ob["TrailerDictionary"]||=0 TrailerDictionary.new(src, @target_encoding) when %r!/Subtype\s*/Image!n @ob["Image"]+=1 if @ob["Image"]||=0 Image.new(src, @target_encoding) when /\bstream\b/n, %r{/ToUnicode\b}n @ob["Stream"]+=1 if @ob["Stream"]||=0 Stream.new(src, @target_encoding) when /\/Font\s*<</mn @ob["Resource"]+=1 if @ob["Resource"]||=0 Resource.new(src, @target_encoding) when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s+){2}R\s*)*\]\s+endobj/mn @ob["ReferenceArray"]+=1 if @ob["ReferenceArray"]||=0 ReferenceArray.new(src, @target_encoding) when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s*))*\]\s+endobj/mn @ob["PdfArray"]+=1 if @ob["PdfArray"]||=0 PdfArray.new(src, @target_encoding) when /obj\s*<</mn @ob["PdfHash"]+=1 if @ob["PdfHash"]||=0 PdfHash.new(src, @target_encoding) else @ob["Unknown"]+=1 if @ob["Unknown"]||=0 Unknown.new(src, @target_encoding) end end ... def build_object_catalogue startobj=0 endobj=0 catalogue = {} @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match| obj = build_object(match.to_s) catalogue.store(obj.oid, obj) end @ob.keys.sort.each do |k| print k, ":\t", @ob[k], "\n" end p "endof build_objects" exit
Result
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v14.pdf CatalogNode: 1 Font: 2 FontDescriptor: 2 PageLeaf: 609 PageNode: 68 PdfHash: 6 Resource: 609 Stream: 612 Unknown: 1 "endof build_objects" masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v16.pdf CatalogNode: 1 ObjStream: 621 PdfHash: 1 Stream: 614 TrailerDictionary: 1 "endof build_objects"
Note
Check pdf with vi
%PDF-1.6 %<e2><e3><cf><d3> ^M2545 0 obj <</Filter/FlateDecode/First 7/Length 272/N 1/Type/ObjStm>>stream ^M@<a2>y<ba>zk#<b3>^P<81><dd>^]<ef>2<e2>a<84><c0>X<1f>y<ec><a2>k<e4>8<b6>>x<91><ac>Shv<97><8a><b7><94><b4>F<a5><a5>E<d5><de>ud<dd>^R<a9>R<8b>G<e8>k<d8>O<ef>8<d9><e5><92><e6><e9><a9>k<d4> ܺ"<da>#^V<cc>M<bc><a0><b6><fd><9a>D<b9><ef><a6>d^YcL5r-<a5>^A<e4>a<95><d2><d1>FF<d5><d8>^U<ce>˗%-<b9>J^^v~<d0>g<e5><cd><e0>bI5XJ<a2>^?N <a9><bd><f3>^L<e1>r~a<87><f2>D<e2><ed><ae>:<dd>ίN.<e4><9a><c5>^X4^?=r<95><f5><e4>J<c3><db><f5><c3> <81><f7>m^R<d8>^_^N<fc><85>˓^C^S<fa><9d><98><dc>N*<c4><ee><8c>2<fb><ad>^B|<e3>Ba^Wt<da>V<dd>$<ff>^Sqfm?<9b><d1><ea>T<e7>t<b9><ca>'<f7><d2>?<ba><b3>?^F<ca>h*<90><9c>4<87>. j<8b><95><c4>!<dc>^Y<98><92>Y<92>I<dc>^@|u<c8>z4 <d4><c9>C^E^B2<99>%<e9><e8>^\<d7>fB<f4>^RT ^Mendstream endobj ...
Notes
Reference
What to do?
Reference
Notes
The purpose of object streams is to allow a greater number of PDF objects to be compressed, thereby allowing a substantial reduction in the size of PDF files.
Check cat v16.pdf
masa@masa ~/ywesee/rpdf2txt $ cat v16.pdf % F-1.6 <</Filter/FlateDecode/First 7/Length 272/N 1/Type/ObjStm>>stream ... <</Filter/FlateDecode/First 688/Length 2117/N 70/Type/ObjStm>>stream ...
Notes
Experiment (Zlib test)
masa@masa ~/work $ cat test.rb require 'zlib' data = File.read("test.gz") buf = Zlib::Inflate.new(Zlib::MAX_WBITS + 32).inflate(data) print buf masa@masa ~/work $ cp test.rb test masa@masa ~/work $ gzip test masa@masa ~/work $ ruby test.rb require 'zlib' data = File.read("test.gz") buf = Zlib::Inflate.new(Zlib::MAX_WBITS + 32).inflate(data) print buf
Note
Experiment (Compressed pdf data)
1. make a compressed pdf file (version 1.6)
begin require 'pdf/writer' rescue LoadError => le if le.message =~ %r{pdf/writer$} $LOAD_PATH.unshift("../lib") require 'pdf/writer' else raise end end pdf = PDF::Writer.new(:version => '1.6') pdf.compressed=true pdf.select_font "Times-Roman" pdf.text "Hello, Ruby.", :font_size => 72, :justification => :center pdf.save_as("hello.pdf")
Notes
2. Check the pdf with vim
%PDF-1.6 %âãÃ<8f>Ã<93> 1 0 obj << /Type /Catalog /Outlines 2 0 R /Pages 3 0 R /Version /1.6>> endobj 2 0 obj << /Type /Outlines >> endobj 3 0 obj << /Type /Pages /Kids [6 0 R ] /Count 1 /Resources << /ProcSet 4 0 R /Font << /F1 8 0 R >> >> /MediaBox [0 0 612.0 792.0] >> endobj 4 0 obj [/PDF /Text ] endobj 5 0 obj << /CreationDate (D:201011241003) /Creator (hello.rb) /Producer (PDF::Writer for Ruby) >> endobj 6 0 obj << /Type /Page /Parent 3 0 R /Contents 7 0 R >> endobj 7 0 obj << /Filter /FlateDecode /Length 82 >> stream x<9c>ã2Ð300P@&<8b>Ò¹0^E<83>ܹ<9c>B^T^L<8d>^Lô,-Í^TÌÌMõÌL,^TBR^TôÝ^L^UÌ<8d>ô^L^TBÒ^T<80>D<91><82><86>GjNN¾<8e>BPiR¥<9e>¦BH<96><82>k^H^@ h^VC endstream endobj 8 0 obj << /Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Times-Roman /Encoding /WinAnsiEncoding >> endobj xref 0 9 0000000000 65535 f 0000000019 00000 n 0000000097 00000 n 0000000134 00000 n 0000000276 00000 n 0000000305 00000 n 0000000410 00000 n 0000000473 00000 n 0000000627 00000 n trailer << /Size 9 /Root 1 0 R /Info 5 0 R >> startxref 736 %%EOF
3. Check the objects
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib/ bin/rpdf2txt hello.pdf Number of Objects=8 CatalogNode: 1 Font: 1 PageLeaf: 1 PageNode: 1 PdfHash: 2 Stream: 1 Unknown: 1 "endof build_objects" ObjStream.length=0 "endof build_object_catalogue"
Notes
4. Decompression test by rpdf2txt
masa@masa ~/ywesee/rpdf2txt $ rpdf2txt hello.pdf Hello, Ruby.
Note
Question
Hypothesis
References
Make a pdf with compressed stream object
begin require 'pdf/writer' rescue LoadError => le if le.message =~ %r{pdf/writer$} $LOAD_PATH.unshift("../lib") require 'pdf/writer' else raise end end pdf = PDF::Writer.new(:version => '1.6') pdf.compressed=true pdf.select_font "Times-Roman" pdf.text "Hello, Ruby.", :font_size => 72, :justification => :center pdf.save_as("hello.pdf")
Run
masa@masa ~/work/pdf_writer-1.1.8/demo $ ruby hello.rb
Result
masa@masa ~/work/pdf_writer-1.1.8/demo $ cat hello.pdf .... 7 0 obj << /Filter /FlateDecode /Length 82 >> stream x���00P@&�ҹ0�ܹ�B � �-��M�L,BR� ̍�B��D���GjNN��BPiR���BH��k hC endstream endobj ...
Save a stream object
lib/rpdf2txt/object.rb#raw_stream
def raw_stream print "@raw_strem=" p @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s open("test.gz", "wb") do |dst_file| dst_file.write(@src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s) end
Run
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt hello.gz.pdf
deflate.rb
require 'zlib' print Zlib::Inflate.inflate(File.read(ARGV[0]))
Run
masa@masa ~/work $ ruby deflate.rb test.gz 0.000 0.000 0.000 rg 0.000 0.000 0.000 RG BT 120.996 675.648 Td /F1 72.0 Tf 0 Tr (Hello, Ruby.) Tj ET
Note
Extract 'Object Stream' object from Gkv pdf (version 1.6)
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v16.pdf
Check deflate.rb
masa@masa ~/work $ ruby deflate.rb test.gz deflate.rb:5:in `inflate': incorrect header check (Zlib::DataError) from deflate.rb:5
Check bit data of the stream object of hello.gz.pdf (created by pdf-writer)
masa@masa ~/work $ xxd -b hello.gz 0000000: 01111000 10011100 11100011 00110010 11010000 00110011 x..2.3 0000006: 00110000 00110000 01010000 01000000 00100110 10001011 00P@&. 000000c: 11010010 10111001 00110000 00000101 10000011 11011100 ..0... 0000012: 10111001 10011100 01000010 00010100 00001100 10001101 ..B... 0000018: 00001100 11110100 00101100 00101101 11001101 00010100 ..,-.. 000001e: 11001100 11001100 01001101 11110101 11001100 01001100 ..M..L 0000024: 00101100 00010100 01000010 01010010 00010100 11110100 ,.BR.. 000002a: 11011101 00001100 00010101 11001100 10001101 11110100 ...... 0000030: 00001100 00010100 01000010 11010010 00010100 10000000 ..B... 0000036: 01000100 10010001 10000010 10000110 01000111 01101010 D...Gj 000003c: 01001110 01001110 10111110 10001110 01000010 01010000 NN..BP 0000042: 01101001 01010010 10100101 10011110 10100110 01000010 iR...B 0000048: 01001000 10010110 10000010 01101011 00001000 00000000 H..k.. 000004e: 00001001 01101000 00010110 01000011 00001010 .h.C.
Another example (demo.gz) created by pdf-writer
masa@masa ~/work $ xxd -b demo.gz |more 0000000: 01111000 10011100 10011101 01011000 11001011 10001110 x..X.. 0000006: 11110101 00110100 00001100 11011110 10011111 10100111 .4.... 000000c: 11101000 00010010 00010110 00101101 10001001 01110011 ...-.s 0000012: 11011111 00100010 00000001 11111011 10011111 01111001 ."...y 0000018: 00000010 10000110 10001011 00000100 11111100 01000010 .....B ...
Notes
Check bit data of the stream object of v16.pdf (Gkv pdf)
masa@masa ~/work $ xxd -b v16.gz 0000000: 01110111 00101110 10110111 00010010 11001100 10101101 w..... 0000006: 01010001 11100111 11110000 00000111 01000010 00100000 Q...B 000000c: 01010000 00010001 11001011 10001010 01001001 01111010 P...Iz 0000012: 10111100 01010110 00111100 11001101 00001000 01111100 .V<..| 0000018: 11110100 00111011 01110101 10011110 01110111 00001001 .;u.w. 000001e: 10000011 10011000 10000110 11111110 01100110 01111111 ....f. 0000024: 01101011 01001111 00011001 01101011 00110100 01000000 kO.k4@ 000002a: 11100110 11010110 01001011 10100010 01000110 00111010 ..K.F: 0000030: 11110111 11100100 00011100 10011111 10010101 11011100 ...... 0000036: 01100110 01110010 01100100 10001010 01001000 00100001 frd.H! 000003c: 10101001 01100011 10000000 11111111 00111000 01101010 .c..8j 0000042: 01111011 01000011 01101111 00000110 01000001 01110000 {Co.Ap 0000048: 00010010 00101110 11100010 11010011 00111100 10100111 ....<. 000004e: 11101011 10111011 01111011 11111100 10110110 01100001 ..{..a 0000054: 10011101 01010010 10111000 00001101 00001010 .R...
Notes
Experiment (Rewrite the header by binary editor)
masa@masa ~/work $ xxd v16.gz 0000000: 772e b712 ccad 51e7 f007 4220 5011 cb8a w.....Q...B P... 0000010: 497a bc56 3ccd 087c f43b 759e 7709 8398 Iz.V<..|.;u.w... 0000020: 86fe 667f 6b4f 196b 3440 e6d6 4ba2 463a ..f.kO.k4@..K.F: 0000030: f7e4 1c9f 95dc 6672 648a 4821 a963 80ff ......frd.H!.c.. 0000040: 386a 7b43 6f06 4170 122e e2d3 3ca7 ebbb 8j{Co.Ap....<... 0000050: 7bfc b661 9d52 b80d 0a {..a.R... masa@masa ~/work $ bvi v16.gz masa@masa ~/work $ xxd v16.gz 0000000: 789c b712 ccad 51e7 f007 4220 5011 cb8a x.....Q...B P... 0000010: 497a bc56 3ccd 087c f43b 759e 7709 8398 Iz.V<..|.;u.w... 0000020: 86fe 667f 6b4f 196b 3440 e6d6 4ba2 463a ..f.kO.k4@..K.F: 0000030: f7e4 1c9f 95dc 6672 648a 4821 a963 80ff ......frd.H!.c.. 0000040: 386a 7b43 6f06 4170 122e e2d3 3ca7 ebbb 8j{Co.Ap....<... 0000050: 7bfc b661 9d52 b80d 0a {..a.R... masa@masa ~/work $ ruby deflate.rb v16.gz deflate.rb:5:in `inflate': invalid block type (Zlib::DataError) from deflate.rb:5
Notes