<< Masa.20101129-install-win7-update-bbmb | 2010 | Masa.20101125-check-object_stream-structure >>
lib/rpdf2txt/object.rb#decode_raw_stream
# Stores raw_stream in @decrypted_stream; when a decoder has been assigned,
# replaces it with the result of @decoder.decrypt(self).
# (Excerpt: the closing `end` of the method is not part of this snippet.)
def decode_raw_stream
@decrypted_stream = raw_stream
unless(@decoder.nil?)
# Dumps the decoder object to stdout.
# NOTE(review): looks like temporary debugging output for this
# investigation -- confirm before keeping it in the library.
print "@decoder="
p @decoder
@decrypted_stream = @decoder.decrypt(self)
end
Point
grep search
masa@masa ~/ywesee/rpdf2txt $ grep -r decoder * lib/rpdf2txt/object.rb: attr_accessor :decoder, :src, :oid lib/rpdf2txt/object.rb: unless(@decoder.nil?) lib/rpdf2txt/object.rb: @decrypted_stream = @decoder.decrypt(self) lib/rpdf2txt/parser.rb: obj.decoder = @encrypt
Note
lib/rpdf2txt/parser.rb#build_trailer_dictionary
# Builds @trailer_dictionary for the PDF source held in @src and, when the
# trailer names an /Encrypt object that exists in @object_catalogue, installs
# a PdfEncrypt decoder on every catalogued object.
#
# Lookup order:
# 1. a TrailerDictionary instance already present in @object_catalogue,
# 2. every "trailer ... startxref" section in @src (later sections are merged
#    into the first dictionary found via #update),
# 3. as a fallback, the object located at the offset given after the final
#    "startxref" keyword.
#
# Expects @object_catalogue to be populated beforehand.
# Returns the TrailerDictionary, or nil when none of the lookups succeeded.
def build_trailer_dictionary
  @trailer_dictionary = @object_catalogue.values.find do |candidate|
    candidate.is_a?(TrailerDictionary)
  end
  startobj = 0
  endobj = 0
  # endobj becomes nil (ending the loop) as soon as a "trailer" keyword is not
  # followed by "startxref".
  while(endobj && (startobj = @src.index(/\btrailer/n, endobj)))
    if(endobj = @src.index(/startxref/n, startobj))
      # index points at the "s" of "startxref"; +8 makes the inclusive range
      # below end on its final character.
      endobj += 8
      trailer_src = @src[startobj..endobj]
      trailer_dictionary = TrailerDictionary.new(trailer_src, @target_encoding)
      if(@trailer_dictionary.nil?)
        @trailer_dictionary = trailer_dictionary
      else
        @trailer_dictionary.update(trailer_dictionary)
      end
    end
  end
  if @trailer_dictionary.nil? \
    && (match = /startxref\s*(\d+)\s*%%EOF/m.match(@src))
    xref_start = match[1].to_i
    # String#index returns nil when the offset lies beyond the source or no
    # "endobj" follows it; skip the fallback instead of calling nil + 6.
    if (xref_end = @src.index(/endobj/n, xref_start))
      xref_src = @src[xref_start...(xref_end + 6)]
      @trailer_dictionary = TrailerDictionary.new(xref_src, @target_encoding)
    end
  end
  # Nothing trailer-like was found: there is no encryption info to evaluate.
  return nil if @trailer_dictionary.nil?
  if (@encrypt_id = @trailer_dictionary.encrypt_id) \
    && (encrypt_obj = @object_catalogue[@encrypt_id])
    @encrypt = PdfEncrypt.new(encrypt_obj.src)
    @encrypt.file_id = @trailer_dictionary.file_id
    # Distinct block variable so the encryption object local is not reused.
    @object_catalogue.each_value do |entry|
      entry.decoder = @encrypt
    end
  end
  @trailer_dictionary
end
Note
Check the actual values above
lib/rpdf2txt/parser.rb#build_trailer_dictionary
if (@encrypt_id = @trailer_dictionary.encrypt_id) \
&& (obj = @object_catalogue[@encrypt_id])
print "@encrypt_id="
p @encrypt_id
print "@obuject_catalogue[@encrypt_id]="
p obj
exit
@encrypt = PdfEncrypt.new(obj.src)
@encrypt.file_id = @trailer_dictionary.file_id
@object_catalogue.each_value do |obj|
obj.decoder = @encrypt
end
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v14.pdf
@encrypt_id=1898
@obuject_catalogue[@encrypt_id]=#<Rpdf2txt::PdfHash:0x7f802996c9d0 @oid=1898, @target_encoding="utf8", @attributes={},
@src="1898 0 obj<</Length 128/Filter/Standard/O(\314\\r\361\201\221i8\315h\340}\021\027\275\3123\006n\273Q>\312\366\002\246d\330b\204\243\374\275)
/P -1044/R 3/U(\006*\335G\320\345p\316w\326\277k\001\036\312\035\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000)/V 2>>\rendobj",
@contents={:r=>"3", :p=>"-1044", :u=>"\006*\335G\320\345p\316w\326\277k\001\036\312\035\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000",
:filter=>"/Standard", :o=>"\314\r\361\201\221i8\315h\340}\021\027\275\3123\006n\273Q>\312\366\002\246d\330b\204\243\374\275", :length=>"128", :v=>"2"}>
Check actual value in v14.pdf
masa@masa ~/ywesee/rpdf2txt $ vim v14.pdf ... trailer^M <</Size 1911/Prev 1153188/Root 1899 0 R/Encrypt 1898 0 R/Info 1896 0 R/ID[<E3EECABE4AA364299244B16AAA72C341><6188A7A747FC8E478C2C12B70C88B92E>]>>^M startxref^M ... 1898 0 obj<</Length 128/Filter/Standard/O(Ì\rñ<81><91>i8Íhà}^Q^W½Ê3^Fn»Q>Êö^B¦dØb<84>£ü½)/P -1044/R 3/U(^F*ÝGÐåpÎwÖ¿k^A^^Ê^]^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@)/V 2>>^Mendobj ... trailer^M <</Size 1897/Encrypt 1898 0 R>>^M startxref^M
Note
Check v16.pdf
...
2544 0 obj^M<</Filter/Standard/Length 128/O(Ì\rñ<81><91>i8Íhà}^Q^W½Ê3^Fn»Q>Êö^B¦dØb<84>£ü½)/P -1044/R 3/U(^¶/%:1U^H;{C·_æh<98>^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@)/V 2>>^Mendobj^M3166 0 obj^M<</DecodeParms<</Columns 5/Predictor 12>>/Encrypt 2544 0 R/Filter/FlateDecode/ID[<D0EFE4F3CB13FE410B331D88732E558E><1102E91A759E90448EC2252012CE5B53>]/Info 2530 0 R/Length 7123/Root 2532 0 R/Size 3167/Type/XRef/W[1 3 1]>>stream
...
endstream^Mendobj^Mstartxref^M
1293921^M
%%EOF^M
Note
Next
Reference
p.91
A trailer giving the location of the cross-reference table and of certain special objects within the body of the file
p.96 3.4.4File Trailer
The trailer of a PDF file enables an application reading the file to quickly find the cross-reference table and certain special objects.
p.97
The startxref line is preceded by the trailer dictionary, consisting of the keyword trailer followed by a series of key-value pairs enclosed in double angle brackets (<< … >>)
Note
p.106 3.4.7Cross-Reference Streams
Cross-reference streams are stream objects (see Section 3.2.7, “Stream Objects”), and contain a dictionary and a data stream. Each cross-reference stream contains the information equivalent to the cross-reference table (see Section 3.4.3, “Cross-Reference Table”) and trailer (see Section 3.4.4, “File Trailer”) for one cross-reference section.
Note
p.106
Note that the value following the startxref keyword is now the offset of the cross-reference stream rather than the xref keyword. For files that use cross-reference streams entirely, the keywords xref and trailer are no longer used. Therefore, with the exception of the startxref address %%EOF segment and comments, a PDF 1.5 file is entirely a sequence of objects.
Note
But
Consideration
So points
So, the dependencies are circular!!
Experiment
lib/rpdf2txt/parser.rb#build_object_catalogue
# Scans @src for every indirect object ("<num> <gen> obj ... endobj"), builds
# each one with build_object and indexes the result by its oid.
#
# Returns a Hash of oid => object. When an oid occurs more than once, the
# occurrence found last in @src replaces the earlier one.
def build_object_catalogue
  catalogue = {}
  # The pattern has no capture groups, so scan yields the whole match.
  @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |object_src|
    obj = build_object(object_src.to_s)
    catalogue.store(obj.oid, obj)
  end
  catalogue
end
lib/rpdf2txt/parser.rb#rebuild_object_catalogue
# Feeds the decoded content of every ObjStream currently in the object
# catalogue to scan_object_stream, together with the catalogue itself.
#
# The list of streams is taken before any of them is scanned.
# Returns the Array of ObjStream instances that were processed.
def rebuild_object_catalogue
  streams = object_catalogue.values.select { |entry| entry.is_a?(ObjStream) }
  streams.each do |stream|
    scan_object_stream(stream.decoded_stream, object_catalogue)
  end
end
lib/rpdf2txt/parser.rb#page_tree_root
# Returns the catalogue entry stored under the trailer dictionary's root_id.
#
# The calls are made in a fixed sequence: object catalogue, trailer
# dictionary, rebuild_object_catalogue, and only then the root lookup.
# NOTE(review): presumably the trailer has to be read first so that the
# decoder assigned in build_trailer_dictionary is in place before object
# streams are decoded -- confirm before reordering.
def page_tree_root
  objects = object_catalogue
  trailer_info = trailer_dictionary
  rebuild_object_catalogue
  objects[trailer_info.root_id]
end
Note
This is the point!
Result
Commit
Run de.oddb.org/bin/oddbd
Run jobs/import_gkv
Result
mhatakeyama@ywesee.com 詳細を表示 12:22 (55分前) Fri Nov 26 11:41:21 2010: de.oddb.org ODDB::Import::Gkv#import Imported 6567 Zubef-Entries on 26.11.2010: Visited 6510 existing Zubef-Entries Visited 6567 existing Companies Visited 1073 existing Substances Created 57 new Zubef-Entries Created 4 new Products Created 45 new Sequences Created 0 new Companies Created 0 new Substances Assigned 0 Chemical Equivalences Assigned 0 Companies Created 10 Incomplete Packages: http://de.oddb.org/de/drugs/package/pzn/5522832 http://de.oddb.org/de/drugs/package/pzn/7652361 http://de.oddb.org/de/drugs/package/pzn/7652378 http://de.oddb.org/de/drugs/package/pzn/7713364 http://de.oddb.org/de/drugs/package/pzn/7713370 http://de.oddb.org/de/drugs/package/pzn/7713387 http://de.oddb.org/de/drugs/package/pzn/7713393 http://de.oddb.org/de/drugs/package/pzn/7713507 http://de.oddb.org/de/drugs/package/pzn/7714085 http://de.oddb.org/de/drugs/package/pzn/5527404 Created 1 Product(s) without a name (missing product name): http://de.oddb.org/de/drugs/product/uid/3480899
Check the downloaded pdf file (version)
How to update a library on production server
lrwxrwxrwx 1 root root 38 2009-04-06 18:40 rpdf2txt -> /home/ywesee/git/rpdf2txt/lib/rpdf2txt so all I did was a git checkout -f in /home/ywesee/git/rpdf2txt
Check the current result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_object.rb
Loaded suite test/test_pdf_object
Started
......................'invalid literal/lengths set' when filtering with /FlateDecode
.ruby: symbol lookup error: /usr/lib64/ruby/gems/1.8/gems/rmagick-2.9.0/lib/RMagick2.so: undefined symbol: DestroyConstitute
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_parser.rb
Loaded suite test/test_pdf_parser
Started
...F........F.....E...
Finished in 3.813254 seconds.
1) Failure:
test_encrypt(TestParser) [test/test_pdf_parser.rb:1319]:
<395> expected but was
<nil>.
2) Failure:
test_join_snippets__hex_chars(TestParser) [test/test_pdf_parser.rb:316]:
<"Paroxetin besitzt eine selektive Wirkung; in-vitro Studien haben gezeigt, dass es, im Gegensatz zu\ntrizyklischen Antidepressiva, eine geringe Affinit\344t f\374r a1-, a2- und b-Adrenozeptoren sowie f\374r\nDopamin (D2)-, 5-HT1-artige, 5-HT2 und Histamin (H1)-Rezeptoren aufweist. Das Fehlen einer\n"> expected but was
<"Paroxetin besitzt eine selektive Wirkung; in-vitro Studien haben gezeigt, dass es, im Gegensatz zu\ntrizyklischen Antidepressiva, eine geringe Affinit\344t f\374r a1-, a2- und b-Adrenozeptoren sowie f\374r\nDopamin (D2)-, 5-HT1-artige, 5-HT2 und Histamin (H1)-Rezeptoren aufweist. Das Fehlen einer\n">.
3) Error:
test_trailer_dictionary(TestParser):
NoMethodError: undefined method `values' for nil:NilClass
/usr/lib64/ruby/site_ruby/1.8/rpdf2txt/parser.rb:54:in `build_trailer_dictionary'
test/test_pdf_parser.rb:1292:in `test_trailer_dictionary'
22 tests, 59 assertions, 2 failures, 1 errors
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_text.rb
Loaded suite test/test_pdf_text
Started
................................
Finished in 1.170674 seconds.
32 tests, 41 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_space_bug_05_2004.rb
Loaded suite test/test_space_bug_05_2004
Started
.
Finished in 0.385029 seconds.
1 tests, 1 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_stream.rb
Loaded suite test/test_stream
Started
..........
Finished in 0.751271 seconds.
10 tests, 14 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_text_state.rb
Loaded suite test/test_text_state
Started
..............
Finished in 1.6615 seconds.
14 tests, 75 assertions, 0 failures, 0 errors
Note
pass
pass
pass
Memo
# Checks that the parser exposes the /Encrypt object id and the /Size value
# when the source contains two trailer sections: /Encrypt 395 appears only in
# the first one, /Size 500 only in the second one.
def test_encrypt
input ='
SDSdASDASd
trailer
<<
/Size 476
/Info 388 0 R
/Encrypt 395 0 R
/Root 394 0 R
/Prev 203754
/ID[<8664e6986751f2a49dccc9a4b40a4f18v><e720b2184372f5e3f4edd86673b81dfd>]
>>
startxref
adfadfadf
trailer
<<
/Size 500
/ID[<8664e6986751f2a49dccc9a4b40a4f18v><e720b2184372f5e3f4edd86673b81dfd>]
>>
startxref'
@parser.src = input
# The object catalogue is requested before the trailer dictionary.
# NOTE(review): presumably required because build_trailer_dictionary reads
# @object_catalogue -- confirm.
@parser.object_catalogue
@parser.trailer_dictionary
assert_equal(395, @parser.encrypt_id)
assert_equal("500", @parser.trailer_dictionary.attributes[:size])
end
Note
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_parser.rb
Loaded suite test/test_pdf_parser
Started
............F.....E...
Finished in 3.712914 seconds.
1) Failure:
test_join_snippets__hex_chars(TestParser) [test/test_pdf_parser.rb:316]:
<"Paroxetin besitzt eine selektive Wirkung; in-vitro Studien haben gezeigt, dass es, im Gegensatz zu\ntrizyklischen Antidepressiva, eine geringe Affinit\344t f\374r a1-, a2- und b-Adrenozeptoren sowie f\374r\nDopamin (D2)-, 5-HT1-artige, 5-HT2 und Histamin (H1)-Rezeptoren aufweist. Das Fehlen einer\n"> expected but was
<"Paroxetin besitzt eine selektive Wirkung; in-vitro Studien haben gezeigt, dass es, im Gegensatz zu\ntrizyklischen Antidepressiva, eine geringe Affinit\344t f\374r a1-, a2- und b-Adrenozeptoren sowie f\374r\nDopamin (D2)-, 5-HT1-artige, 5-HT2 und Histamin (H1)-Rezeptoren aufweist. Das Fehlen einer\n">.
2) Error:
test_trailer_dictionary(TestParser):
NoMethodError: undefined method `values' for nil:NilClass
/usr/lib64/ruby/site_ruby/1.8/rpdf2txt/parser.rb:54:in `build_trailer_dictionary'
test/test_pdf_parser.rb:1292:in `test_trailer_dictionary'
22 tests, 60 assertions, 1 failures, 1 errors
Add $KCODE
$KCODE = "UTF8"
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_parser.rb
Loaded suite test/test_pdf_parser
Started
..................E...
Finished in 3.741281 seconds.
1) Error:
test_trailer_dictionary(TestParser):
NoMethodError: undefined method `values' for nil:NilClass
/usr/lib64/ruby/site_ruby/1.8/rpdf2txt/parser.rb:54:in `build_trailer_dictionary'
test/test_pdf_parser.rb:1294:in `test_trailer_dictionary'
22 tests, 60 assertions, 0 failures, 1 errors
# Calls build_trailer_dictionary directly on a source with two trailer
# sections and checks that values from both are available afterwards:
# /Size from the second section, /Info and /Encrypt from the first.
def test_trailer_dictionary
input ='
SDSdASDASd
trailer
<<
/Size 476
/Info 388 0 R
/Encrypt 395 0 R
/Root 394 0 R
/Prev 203754
/ID[<8664e6986751f2a49dccc9a4b40a4f18v><e720b2184372f5e3f4edd86673b81dfd>]
>>
startxref
adfadfadf
trailer
<<
/Size 500
/ID[<8664e6986751f2a49dccc9a4b40a4f18v><e720b2184372f5e3f4edd86673b81dfd>]
>>
startxref'
@parser.src = input
# The object catalogue is requested first.
# NOTE(review): presumably because build_trailer_dictionary calls
# @object_catalogue.values and fails on nil otherwise -- confirm.
@parser.object_catalogue
@parser.build_trailer_dictionary
assert_equal("500", @parser.trailer_dictionary.attributes[:size])
assert_equal("388 0 R", @parser.trailer_dictionary.attributes[:info])
assert_equal(395, @parser.trailer_dictionary.encrypt_id)
end
Note
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_parser.rb Loaded suite test/test_pdf_parser Started ...................... Finished in 3.735419 seconds. 22 tests, 63 assertions, 0 failures, 0 errors
Problem
suspend
Reinstall rmagick
masa@masa ~/ywesee/rpdf2txt $ sudo gem uninstall rmagick masa@masa ~/ywesee/rpdf2txt $ sudo gem install rmagick
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_object.rb
Loaded suite test/test_pdf_object
Started
......................'invalid literal/lengths set' when filtering with /FlateDecode
.......E.....E................unknown encoding 370 0 R
...
Finished in 5.116151 seconds.
1) Error:
test_inline_img(Rpdf2txt::TestInlineImage):
TypeError: can't convert nil into String
test/test_pdf_object.rb:1797:in `exist?'
test/test_pdf_object.rb:1797:in `test_inline_img'
2) Error:
test_text_space_bug2(Rpdf2txt::TestPageLeaf):
Errno::ENOENT: No such file or directory - /home/masa/ywesee/rpdf2txt/test/data/space_bug_stream2.txt
test/test_pdf_object.rb:1665:in `read'
test/test_pdf_object.rb:1665:in `test_text_space_bug2'
55 tests, 94 assertions, 0 failures, 2 errors
Notes
Comment out
=begin
def test_text_space_bug2
stream = Stream.new
path = File.expand_path('data/space_bug_stream2.txt',
File.dirname(__FILE__))
fontsrc15 = <<-EOS
327 0 obj^M<</Subtype/Type1/FontDescriptor 325 0 R/LastChar 252/Widths[278 389 0 0 0 1000 722 278 333 333 556 600 278 333 278 278 556 556 556 556 556 556 556 556 556 556 278 278 0 600 0 0 0 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778 556 778 611 500 556 722 667 1000 667 667 556 0 0 0 0 500 0 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611 611 611 389 389 389 611 500 833 500 500 500 0 0 0 0 0 0 0 0 0 556 0 0 0 0 0 0 0 0 0 0 0 0 0 278 556 0 0 500 0 0 0 0 0 944 0 0 0 0 0 0 0 0 0 0 0 0 0 0 556 0 0 0 0 400 0 0 0 0 611 0 0 0 0 0 556 0 0 0 0 0 0 0 0 722 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 778 0 0 0 0 0 722 0 0 0 556 0 556 0 556 0 0 444 556 556 556 0 0 0 278 278 0 0 0 0 611 0 611 0 0 611 0 611 611]/BaseFont/PLWAZC+Frutiger-Roman/FirstChar 32/Encoding/WinAnsiEncoding/Type/Font>>^Mendobj
EOS
font15 = Font.new(fontsrc15)
stream.decoded_stream = File.read path
page = PageLeaf.new
page.resources = resource = Resource.new
resource.instance_variable_get('@fonts').store(:r15, font15)
handler = SimpleHandler.new
page.contents = [stream]
page.text(handler)
expected = "Inhalt / Table des mati\303\250res"
assert_equal(expected.strip, handler.out.strip[0,28])
expected = '10 mg, 20 mg und 40 mg'
assert_equal(expected.strip, handler.out.strip[346,22])
end
=end
Note
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_object.rb
Loaded suite test/test_pdf_object
Started
......................'invalid literal/lengths set' when filtering with /FlateDecode
.......E.....................unknown encoding 370 0 R
...
Finished in 4.981177 seconds.
1) Error:
test_inline_img(Rpdf2txt::TestInlineImage):
TypeError: can't convert nil into String
test/test_pdf_object.rb:1797:in `exist?'
test/test_pdf_object.rb:1797:in `test_inline_img'
54 tests, 94 assertions, 0 failures, 1 errors
class TestInlineImage < Test::Unit::TestCase
# Builds an InlineImage from a small attribute list and compressed data,
# writes its image to a temporary PNG and compares it with the reference
# image stored in data/inline.png.
def test_inline_img
  attrs = <<-EOS
/W 113
/CS /DeviceGray
/BPC 8
/DP << /Predictor 15
/Columns 113
>>
/F /Fl
/H 1
  EOS
  data = "x\234cd\2407\000\000\000\344\000\002"
  obj = InlineImage.new(attrs, data)
  assert_nothing_raised { obj.image }
  path = File.expand_path('data/inline.png', File.dirname(__FILE__))
  good = Magick::Image.read path
  tmp_path = Tempfile.new('test').path + '.png'
  obj.image.write tmp_path
  tmp = Magick::Image.read tmp_path
  assert_equal(good, tmp)
rescue StandardError => e
  # Prints the exception instead of letting it propagate.
  # NOTE(review): depending on the test framework this may also hide
  # assertion failures -- confirm that this is intended.
  p e
ensure
  # tmp_path is still nil when anything before the Tempfile line raised;
  # File.exist?(nil) would raise a TypeError here and mask the real problem.
  File.delete tmp_path if tmp_path && File.exist?(tmp_path)
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_object.rb Loaded suite test/test_pdf_object Started ......................'invalid literal/lengths set' when filtering with /FlateDecode .............................unknown encoding 370 0 R ... Finished in 5.051024 seconds. 54 tests, 95 assertions, 0 failures, 0 errors
Note
suite.rb (all tests)
masa@masa ~/ywesee/rpdf2txt $ ruby test/suite.rb Loaded suite test/suite Started ......................'invalid literal/lengths set' when filtering with /FlateDecode ...................................................................unknown encoding 370 0 R ............................................ Finished in 12.029007 seconds. 133 tests, 289 assertions, 0 failures, 0 errors
Next
add method test/test_pdf_parser.rb
# Parses data/encrypted_object_stream.pdf and checks the object catalogue
# before and after rebuild_object_catalogue: three objects are expected up
# front, and one more (oid 2530) once the object stream has been scanned.
def test_rebuild_object_catalogue
  file = File.expand_path('./data/encrypted_object_stream.pdf', File.dirname(__FILE__))
  input = File.read(file)
  parser = Rpdf2txt::Parser.new(input)
  cat = parser.object_catalogue
  assert_equal(3, cat.length)
  # assert_equal takes (expected, actual); keeping that order makes the
  # "<expected> expected but was <actual>" failure message read correctly.
  assert_equal(Rpdf2txt::ObjStream, cat[2545].class)
  assert_equal(Rpdf2txt::TrailerDictionary, cat[3166].class)
  assert_equal(Rpdf2txt::PdfHash, cat[2544].class)
  # The trailer dictionary is requested before the catalogue is rebuilt.
  parser.trailer_dictionary
  parser.rebuild_object_catalogue
  assert_equal(4, cat.length)
  assert_equal(Rpdf2txt::PdfHash, cat[2530].class)
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_parser.rb Loaded suite test/test_pdf_parser Started ....................... Finished in 3.785444 seconds. 23 tests, 69 assertions, 0 failures, 0 errors
Commit
cdrom::19:haldaemon,masa