<< Masa.20101125-check-object_stream-structure | 2010 | Masa.20101122-testcase-grant_download-command >>
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v16.pdf
'incorrect header check' when filtering with /FlateDecode
./lib/rpdf2txt/parser.rb:146:in `scan_object_stream': undefined method `[]' for nil:NilClass (NoMethodError)
from ./lib/rpdf2txt/parser.rb:137:in `build_object_catalogue'
from ./lib/rpdf2txt/parser.rb:134:in `each'
from ./lib/rpdf2txt/parser.rb:134:in `build_object_catalogue'
from ./lib/rpdf2txt/parser.rb:47:in `object_catalogue'
from ./lib/rpdf2txt/parser.rb:160:in `page_tree_root'
from ./lib/rpdf2txt/parser.rb:142:in `build_page_tree'
from ./lib/rpdf2txt/parser.rb:50:in `page_tree'
from ./lib/rpdf2txt/parser.rb:40:in `extract_text'
from bin/rpdf2txt:58
Experiment
lib/rpdf2txt/parser.rb#build_object, #build_object_catalogue
# Wraps one raw PDF object (the text of an "N G obj ... endobj" span) in the
# node class that matches its content.
#
# src is tested against an ordered list of patterns; the first rule with a
# matching pattern decides the class, and Unknown is used when none match.
# Every node is constructed with (src, @target_encoding).
#
# Debug aid for this experiment: prints "getin ObjStm" whenever an object
# stream (/Type /ObjStm) is recognised.
def build_object(src)
  # Order matters: the explicit /Type checks come before the generic
  # stream / array / dictionary fallbacks.
  rules = [
    [[/\/Type\s*\/Catalog\b/n], CatalogNode],
    [[/\/Type\s*\/Pages\b/n], PageNode],
    [[/\/Type\s*\/Page\b/n], PageLeaf],
    [[/\/Type\s*\/Font\b/n], Font],
    [[/\/Type\s*\/FontDescriptor\b/n], FontDescriptor],
    [[/\/Type\s*\/Encoding\b/n], Encoding],
    [[/\/Type\s*\/ObjStm\b/n], ObjStream],
    [[/\/Type\s*\/XRef\b/n], TrailerDictionary],
    [[%r!/Subtype\s*/Image!n], Image],
    [[/\bstream\b/n, %r{/ToUnicode\b}n], Stream],
    [[/\/Font\s*<</mn], Resource],
    [[/^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s+){2}R\s*)*\]\s+endobj/mn], ReferenceArray],
    [[/^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s*))*\]\s+endobj/mn], PdfArray],
    [[/obj\s*<</mn], PdfHash]
  ]
  matched = rules.find do |patterns, _node|
    patterns.any? { |pattern| pattern === src }
  end
  node_class = matched ? matched.last : Unknown
  p "getin ObjStm" if node_class.equal?(ObjStream)
  node_class.new(src, @target_encoding)
end
...
def build_object_catalogue
startobj=0
endobj=0
catalogue = {}
@src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match|
obj = build_object(match.to_s)
catalogue.store(obj.oid, obj)
end
catalogue.values.select do |obj|
obj.is_a?(ObjStream)
end.each do |obj|
scan_object_stream obj.decoded_stream, catalogue
end
catalogue
p "endof build_object_catalogue"
exit
Run
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v14.pdf
"endof build_object_catalogue"
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v16.pdf
"getin ObjStm"
'incorrect header check' when filtering with /FlateDecode
./lib/rpdf2txt/parser.rb:150:in `scan_object_stream': undefined method `[]' for nil:NilClass (NoMethodError)
from ./lib/rpdf2txt/parser.rb:138:in `build_object_catalogue'
from ./lib/rpdf2txt/parser.rb:135:in `each'
from ./lib/rpdf2txt/parser.rb:135:in `build_object_catalogue'
from ./lib/rpdf2txt/parser.rb:47:in `object_catalogue'
from ./lib/rpdf2txt/parser.rb:164:in `page_tree_root'
from ./lib/rpdf2txt/parser.rb:145:in `build_page_tree'
from ./lib/rpdf2txt/parser.rb:50:in `page_tree'
from ./lib/rpdf2txt/parser.rb:40:in `extract_text'
from bin/rpdf2txt:58
Notes
Experiment
# Instrumented variant of build_object: wraps one raw PDF object (the text of
# an "N G obj ... endobj" span) in the node class that matches its content and
# records, in the @ob tally, how many objects of each kind were built.
#
# The when-clauses are tried top to bottom and the first match wins, so the
# explicit /Type checks precede the generic stream / array / dictionary
# fallbacks. Every node is constructed with (src, @target_encoding).
def build_object(src)
  case src
  when /\/Type\s*\/Catalog\b/n
    count_object("CatalogNode")
    CatalogNode.new(src, @target_encoding)
  when /\/Type\s*\/Pages\b/n
    count_object("PageNode")
    PageNode.new(src, @target_encoding)
  when /\/Type\s*\/Page\b/n
    count_object("PageLeaf")
    PageLeaf.new(src, @target_encoding)
  when /\/Type\s*\/Font\b/n
    count_object("Font")
    Font.new(src, @target_encoding)
  when /\/Type\s*\/FontDescriptor\b/n
    count_object("FontDescriptor")
    FontDescriptor.new(src, @target_encoding)
  when /\/Type\s*\/Encoding\b/n
    count_object("Encoding")
    Encoding.new(src, @target_encoding)
  when /\/Type\s*\/ObjStm\b/n
    count_object("ObjStream")
    ObjStream.new(src, @target_encoding)
  when /\/Type\s*\/XRef\b/n
    count_object("TrailerDictionary")
    TrailerDictionary.new(src, @target_encoding)
  when %r!/Subtype\s*/Image!n
    count_object("Image")
    Image.new(src, @target_encoding)
  when /\bstream\b/n, %r{/ToUnicode\b}n
    count_object("Stream")
    Stream.new(src, @target_encoding)
  when /\/Font\s*<</mn
    count_object("Resource")
    Resource.new(src, @target_encoding)
  when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s+){2}R\s*)*\]\s+endobj/mn
    count_object("ReferenceArray")
    ReferenceArray.new(src, @target_encoding)
  when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s*))*\]\s+endobj/mn
    count_object("PdfArray")
    PdfArray.new(src, @target_encoding)
  when /obj\s*<</mn
    count_object("PdfHash")
    PdfHash.new(src, @target_encoding)
  else
    count_object("Unknown")
    Unknown.new(src, @target_encoding)
  end
end

# Adds one to the @ob tally entry for +kind+, creating the tally hash and the
# entry on first use so the counter also works when @ob was never initialised.
def count_object(kind)
  @ob ||= {}
  @ob[kind] = (@ob[kind] || 0) + 1
end
...
def build_object_catalogue
startobj=0
endobj=0
catalogue = {}
@src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match|
obj = build_object(match.to_s)
catalogue.store(obj.oid, obj)
end
@ob.keys.sort.each do |k|
print k, ":\t", @ob[k], "\n"
end
p "endof build_objects"
exit
Result
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v14.pdf CatalogNode: 1 Font: 2 FontDescriptor: 2 PageLeaf: 609 PageNode: 68 PdfHash: 6 Resource: 609 Stream: 612 Unknown: 1 "endof build_objects" masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v16.pdf CatalogNode: 1 ObjStream: 621 PdfHash: 1 Stream: 614 TrailerDictionary: 1 "endof build_objects"
Note
Check pdf with vi
%PDF-1.6 %<e2><e3><cf><d3> ^M2545 0 obj <</Filter/FlateDecode/First 7/Length 272/N 1/Type/ObjStm>>stream ^M@<a2>y<ba>zk#<b3>^P<81><dd>^]<ef>2<e2>a<84><c0>X<1f>y<ec><a2>k<e4>8<b6>>x<91><ac>Shv<97><8a><b7><94><b4>F<a5><a5>E<d5><de>ud<dd>^R<a9>R<8b>G<e8>k<d8>O<ef>8<d9><e5><92><e6><e9><a9>k<d4> ܺ"<da>#^V<cc>M<bc><a0><b6><fd><9a>D<b9><ef><a6>d^YcL5r-<a5>^A<e4>a<95><d2><d1>FF<d5><d8>^U<ce>˗%-<b9>J^^v~<d0>g<e5><cd><e0>bI5XJ<a2>^?N <a9><bd><f3>^L<e1>r~a<87><f2>D<e2><ed><ae>:<dd>ίN.<e4><9a><c5>^X4^?=r<95><f5><e4>J<c3><db><f5><c3> <81><f7>m^R<d8>^_^N<fc><85>˓^C^S<fa><9d><98><dc>N*<c4><ee><8c>2<fb><ad>^B|<e3>Ba^Wt<da>V<dd>$<ff>^Sqfm?<9b><d1><ea>T<e7>t<b9><ca>'<f7><d2>?<ba><b3>?^F<ca>h*<90><9c>4<87>. j<8b><95><c4>!<dc>^Y<98><92>Y<92>I<dc>^@|u<c8>z4 <d4><c9>C^E^B2<99>%<e9><e8>^\<d7>fB<f4>^RT ^Mendstream endobj ...
Notes
Reference
What to do?
Reference
Notes
The purpose of object streams is to allow a greater number of PDF objects to be compressed, thereby allowing a substantial reduction in the size of PDF files.
Check cat v16.pdf
masa@masa ~/ywesee/rpdf2txt $ cat v16.pdf % F-1.6 <</Filter/FlateDecode/First 7/Length 272/N 1/Type/ObjStm>>stream ... <</Filter/FlateDecode/First 688/Length 2117/N 70/Type/ObjStm>>stream ...
Notes
Experiment (Zlib test)
masa@masa ~/work $ cat test.rb
require 'zlib'
data = File.read("test.gz")
buf = Zlib::Inflate.new(Zlib::MAX_WBITS + 32).inflate(data)
print buf
masa@masa ~/work $ cp test.rb test
masa@masa ~/work $ gzip test
masa@masa ~/work $ ruby test.rb
require 'zlib'
data = File.read("test.gz")
buf = Zlib::Inflate.new(Zlib::MAX_WBITS + 32).inflate(data)
print buf
Note
Experiment (Compressed pdf data)
1. make a compressed pdf file (version 1.6)
# Writes hello.pdf: a PDF::Writer document created with :version => '1.6' and
# compression switched on, containing one centred line of text.
begin
  require 'pdf/writer'
rescue LoadError => load_error
  # Only fall back to ../lib when pdf/writer itself is what failed to load;
  # any other LoadError is re-raised unchanged.
  raise unless load_error.message =~ %r{pdf/writer$}

  $LOAD_PATH.unshift("../lib")
  require 'pdf/writer'
end

writer = PDF::Writer.new(:version => '1.6')
writer.compressed = true
writer.select_font("Times-Roman")
writer.text("Hello, Ruby.", :font_size => 72, :justification => :center)
writer.save_as("hello.pdf")
Notes
2. Check the pdf with vim
%PDF-1.6
%âãÃ<8f>Ã<93>
1 0 obj
<< /Type /Catalog
/Outlines 2 0 R
/Pages 3 0 R
/Version /1.6>>
endobj
2 0 obj
<< /Type /Outlines >>
endobj
3 0 obj
<< /Type /Pages
/Kids [6 0 R
]
/Count 1
/Resources <<
/ProcSet 4 0 R
/Font <<
/F1 8 0 R >>
>>
/MediaBox [0 0 612.0 792.0]
>>
endobj
4 0 obj
[/PDF /Text ]
endobj
5 0 obj
<<
/CreationDate (D:201011241003)
/Creator (hello.rb)
/Producer (PDF::Writer for Ruby)
>>
endobj
6 0 obj
<< /Type /Page
/Parent 3 0 R
/Contents 7 0 R
>>
endobj
7 0 obj
<< /Filter /FlateDecode
/Length 82 >>
stream
x<9c>ã2Ð300P@&<8b>Ò¹0^E<83>ܹ<9c>B^T^L<8d>^Lô,-Í^TÌÌMõÌL,^TBR^TôÝ^L^UÌ<8d>ô^L^TBÒ^T<80>D<91><82><86>GjNN¾<8e>BPiR¥<9e>¦BH<96><82>k^H^@ h^VC
endstream
endobj
8 0 obj
<< /Type /Font
/Subtype /Type1
/Name /F1
/BaseFont /Times-Roman
/Encoding /WinAnsiEncoding
>>
endobj
xref
0 9
0000000000 65535 f
0000000019 00000 n
0000000097 00000 n
0000000134 00000 n
0000000276 00000 n
0000000305 00000 n
0000000410 00000 n
0000000473 00000 n
0000000627 00000 n
trailer
<< /Size 9
/Root 1 0 R
/Info 5 0 R
>>
startxref
736
%%EOF
3. Check the objects
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib/ bin/rpdf2txt hello.pdf Number of Objects=8 CatalogNode: 1 Font: 1 PageLeaf: 1 PageNode: 1 PdfHash: 2 Stream: 1 Unknown: 1 "endof build_objects" ObjStream.length=0 "endof build_object_catalogue"
Notes
4. Decompression test by rpdf2txt
masa@masa ~/ywesee/rpdf2txt $ rpdf2txt hello.pdf Hello, Ruby.
Note
Question
Hypothesis
References
Make a pdf with compressed stream object
# Writes hello.pdf: a PDF::Writer document created with :version => '1.6' and
# compression switched on, containing one centred line of text.
begin
  require 'pdf/writer'
rescue LoadError => load_error
  # Only fall back to ../lib when pdf/writer itself is what failed to load;
  # any other LoadError is re-raised unchanged.
  raise unless load_error.message =~ %r{pdf/writer$}

  $LOAD_PATH.unshift("../lib")
  require 'pdf/writer'
end

writer = PDF::Writer.new(:version => '1.6')
writer.compressed = true
writer.select_font("Times-Roman")
writer.text("Hello, Ruby.", :font_size => 72, :justification => :center)
writer.save_as("hello.pdf")
Run
masa@masa ~/work/pdf_writer-1.1.8/demo $ ruby hello.rb
Result
masa@masa ~/work/pdf_writer-1.1.8/demo $ cat hello.pdf
....
7 0 obj
<< /Filter /FlateDecode
/Length 82 >>
stream
x���00P@&�ҹ0�ܹ�B
�
�-��M�L,BR�
̍�B��D���GjNN��BPiR���BH��k hC
endstream
endobj
...
Save a stream object
lib/rpdf2txt/object.rb#raw_stream
def raw_stream
print "@raw_strem="
p @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
open("test.gz", "wb") do |dst_file|
dst_file.write(@src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s)
end
Run
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt hello.gz.pdf
deflate.rb
# deflate.rb -- inflate the zlib-compressed file named by the first
# command-line argument and print the decompressed data to stdout.
require 'zlib'

# Read in binary mode: the input is raw compressed bytes (it is written with
# "wb"), so no newline or encoding translation may be applied to it.
compressed = File.open(ARGV[0], 'rb') { |file| file.read }
print Zlib::Inflate.inflate(compressed)
Run
masa@masa ~/work $ ruby deflate.rb test.gz 0.000 0.000 0.000 rg 0.000 0.000 0.000 RG BT 120.996 675.648 Td /F1 72.0 Tf 0 Tr (Hello, Ruby.) Tj ET
Note
Extract 'Object Stream' object from Gkv pdf (version 1.6)
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v16.pdf
Check deflate.rb
masa@masa ~/work $ ruby deflate.rb test.gz
deflate.rb:5:in `inflate': incorrect header check (Zlib::DataError)
from deflate.rb:5
Check bit data of the stream object of hello.gz.pdf (created by pdf-writer)
masa@masa ~/work $ xxd -b hello.gz 0000000: 01111000 10011100 11100011 00110010 11010000 00110011 x..2.3 0000006: 00110000 00110000 01010000 01000000 00100110 10001011 00P@&. 000000c: 11010010 10111001 00110000 00000101 10000011 11011100 ..0... 0000012: 10111001 10011100 01000010 00010100 00001100 10001101 ..B... 0000018: 00001100 11110100 00101100 00101101 11001101 00010100 ..,-.. 000001e: 11001100 11001100 01001101 11110101 11001100 01001100 ..M..L 0000024: 00101100 00010100 01000010 01010010 00010100 11110100 ,.BR.. 000002a: 11011101 00001100 00010101 11001100 10001101 11110100 ...... 0000030: 00001100 00010100 01000010 11010010 00010100 10000000 ..B... 0000036: 01000100 10010001 10000010 10000110 01000111 01101010 D...Gj 000003c: 01001110 01001110 10111110 10001110 01000010 01010000 NN..BP 0000042: 01101001 01010010 10100101 10011110 10100110 01000010 iR...B 0000048: 01001000 10010110 10000010 01101011 00001000 00000000 H..k.. 000004e: 00001001 01101000 00010110 01000011 00001010 .h.C.
Another example (demo.gz) created by pdf-writer
masa@masa ~/work $ xxd -b demo.gz |more 0000000: 01111000 10011100 10011101 01011000 11001011 10001110 x..X.. 0000006: 11110101 00110100 00001100 11011110 10011111 10100111 .4.... 000000c: 11101000 00010010 00010110 00101101 10001001 01110011 ...-.s 0000012: 11011111 00100010 00000001 11111011 10011111 01111001 ."...y 0000018: 00000010 10000110 10001011 00000100 11111100 01000010 .....B ...
Notes
Check bit data of the stream object of v16.pdf (Gkv pdf)
masa@masa ~/work $ xxd -b v16.gz
0000000: 01110111 00101110 10110111 00010010 11001100 10101101 w.....
0000006: 01010001 11100111 11110000 00000111 01000010 00100000 Q...B
000000c: 01010000 00010001 11001011 10001010 01001001 01111010 P...Iz
0000012: 10111100 01010110 00111100 11001101 00001000 01111100 .V<..|
0000018: 11110100 00111011 01110101 10011110 01110111 00001001 .;u.w.
000001e: 10000011 10011000 10000110 11111110 01100110 01111111 ....f.
0000024: 01101011 01001111 00011001 01101011 00110100 01000000 kO.k4@
000002a: 11100110 11010110 01001011 10100010 01000110 00111010 ..K.F:
0000030: 11110111 11100100 00011100 10011111 10010101 11011100 ......
0000036: 01100110 01110010 01100100 10001010 01001000 00100001 frd.H!
000003c: 10101001 01100011 10000000 11111111 00111000 01101010 .c..8j
0000042: 01111011 01000011 01101111 00000110 01000001 01110000 {Co.Ap
0000048: 00010010 00101110 11100010 11010011 00111100 10100111 ....<.
000004e: 11101011 10111011 01111011 11111100 10110110 01100001 ..{..a
0000054: 10011101 01010010 10111000 00001101 00001010 .R...
Notes
Experiment (Rewrite the header by binary editor)
masa@masa ~/work $ xxd v16.gz
0000000: 772e b712 ccad 51e7 f007 4220 5011 cb8a w.....Q...B P...
0000010: 497a bc56 3ccd 087c f43b 759e 7709 8398 Iz.V<..|.;u.w...
0000020: 86fe 667f 6b4f 196b 3440 e6d6 4ba2 463a ..f.kO.k4@..K.F:
0000030: f7e4 1c9f 95dc 6672 648a 4821 a963 80ff ......frd.H!.c..
0000040: 386a 7b43 6f06 4170 122e e2d3 3ca7 ebbb 8j{Co.Ap....<...
0000050: 7bfc b661 9d52 b80d 0a {..a.R...
masa@masa ~/work $ bvi v16.gz
masa@masa ~/work $ xxd v16.gz
0000000: 789c b712 ccad 51e7 f007 4220 5011 cb8a x.....Q...B P...
0000010: 497a bc56 3ccd 087c f43b 759e 7709 8398 Iz.V<..|.;u.w...
0000020: 86fe 667f 6b4f 196b 3440 e6d6 4ba2 463a ..f.kO.k4@..K.F:
0000030: f7e4 1c9f 95dc 6672 648a 4821 a963 80ff ......frd.H!.c..
0000040: 386a 7b43 6f06 4170 122e e2d3 3ca7 ebbb 8j{Co.Ap....<...
0000050: 7bfc b661 9d52 b80d 0a {..a.R...
masa@masa ~/work $ ruby deflate.rb v16.gz
deflate.rb:5:in `inflate': invalid block type (Zlib::DataError)
from deflate.rb:5
Notes