<< Masa.20101129-install-win7-update-bbmb | 2010 | Masa.20101125-check-object_stream-structure >>
lib/rpdf2txt/object.rb#decode_raw_stream
def decode_raw_stream @decrypted_stream = raw_stream unless(@decoder.nil?) print "@decoder=" p @decoder @decrypted_stream = @decoder.decrypt(self) end
Point
grep search
masa@masa ~/ywesee/rpdf2txt $ grep -r decoder * lib/rpdf2txt/object.rb: attr_accessor :decoder, :src, :oid lib/rpdf2txt/object.rb: unless(@decoder.nil?) lib/rpdf2txt/object.rb: @decrypted_stream = @decoder.decrypt(self) lib/rpdf2txt/parser.rb: obj.decoder = @encrypt
Note
lib/rpdf2txt/parser.rb#build_trailer_dictionary
def build_trailer_dictionary @trailer_dictionary = @object_catalogue.values.find do |obj| obj.is_a?(TrailerDictionary) end startobj = 0 endobj = 0 while(endobj && (startobj = @src.index(/\btrailer/n, endobj))) if(endobj = @src.index(/startxref/n, startobj)) endobj+= 8 trailer_src = @src[startobj..endobj] trailer_dictionary = TrailerDictionary.new(trailer_src, @target_encoding) if(@trailer_dictionary.nil?) @trailer_dictionary = trailer_dictionary else @trailer_dictionary.update(trailer_dictionary) end end end if @trailer_dictionary.nil? \ && match = /startxref\s*(\d+)\s*%%EOF/m.match(@src) startobj = match[1].to_i endobj = @src.index(/endobj/n, startobj) + 6 xref_src = @src[startobj...endobj] @trailer_dictionary = TrailerDictionary.new(xref_src, @target_encoding) end if (@encrypt_id = @trailer_dictionary.encrypt_id) \ && (obj = @object_catalogue[@encrypt_id]) @encrypt = PdfEncrypt.new(obj.src) @encrypt.file_id = @trailer_dictionary.file_id @object_catalogue.each_value do |obj| obj.decoder = @encrypt end end @trailer_dictionary end
Note
Check the actual values above
lib/rpdf2txt/parser.rb#build_trailer_dictionary
if (@encrypt_id = @trailer_dictionary.encrypt_id) \ && (obj = @object_catalogue[@encrypt_id]) print "@encrypt_id=" p @encrypt_id print "@obuject_catalogue[@encrypt_id]=" p obj exit @encrypt = PdfEncrypt.new(obj.src) @encrypt.file_id = @trailer_dictionary.file_id @object_catalogue.each_value do |obj| obj.decoder = @encrypt end end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby -I lib bin/rpdf2txt v14.pdf @encrypt_id=1898 @obuject_catalogue[@encrypt_id]=#<Rpdf2txt::PdfHash:0x7f802996c9d0 @oid=1898, @target_encoding="utf8", @attributes={}, @src="1898 0 obj<</Length 128/Filter/Standard/O(\314\\r\361\201\221i8\315h\340}\021\027\275\3123\006n\273Q>\312\366\002\246d\330b\204\243\374\275) /P -1044/R 3/U(\006*\335G\320\345p\316w\326\277k\001\036\312\035\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000)/V 2>>\rendobj", @contents={:r=>"3", :p=>"-1044", :u=>"\006*\335G\320\345p\316w\326\277k\001\036\312\035\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000", :filter=>"/Standard", :o=>"\314\r\361\201\221i8\315h\340}\021\027\275\3123\006n\273Q>\312\366\002\246d\330b\204\243\374\275", :length=>"128", :v=>"2"}>
Check actual value in v14.pdf
masa@masa ~/ywesee/rpdf2txt $ vim v14.pdf ... trailer^M <</Size 1911/Prev 1153188/Root 1899 0 R/Encrypt 1898 0 R/Info 1896 0 R/ID[<E3EECABE4AA364299244B16AAA72C341><6188A7A747FC8E478C2C12B70C88B92E>]>>^M startxref^M ... 1898 0 obj<</Length 128/Filter/Standard/O(Ì\rñ<81><91>i8Íhà}^Q^W½Ê3^Fn»Q>Êö^B¦dØb<84>£ü½)/P -1044/R 3/U(^F*ÝGÐåpÎwÖ¿k^A^^Ê^]^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@)/V 2>>^Mendobj ... trailer^M <</Size 1897/Encrypt 1898 0 R>>^M startxref^M
Note
Check v16.pdf
... 2544 0 obj^M<</Filter/Standard/Length 128/O(Ì\rñ<81><91>i8Íhà}^Q^W½Ê3^Fn»Q>Êö^B¦dØb<84>£ü½)/P -1044/R 3/U(^¶/%:1U^H;{C·_æh<98>^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@^@)/V 2>>^Mendobj^M3166 0 obj^M<</DecodeParms<</Columns 5/Predictor 12>>/Encrypt 2544 0 R/Filter/FlateDecode/ID[<D0EFE4F3CB13FE410B331D88732E558E><1102E91A759E90448EC2252012CE5B53>]/Info 2530 0 R/Length 7123/Root 2532 0 R/Size 3167/Type/XRef/W[1 3 1]>>stream ... endstream^Mendobj^Mstartxref^M 1293921^M %%EOF^M
Note
Next
Reference
p.91
A trailer giving the location of the cross-reference table and of certain special objects within the body of the file
p.96 3.4.4File Trailer
The trailer of a PDF file enables an application reading the file to quickly find the cross-reference table and certain special objects.
p.97
The startxref line is preceded by the trailer dictionary, consisting of the keyword trailer followed by a series of key-value pairs enclosed in double angle brackets (<< … >>)
Note
p.106 3.4.7Cross-Reference Streams
Cross-reference streams are stream objects (see Section 3.2.7, “Stream Objects”), and contain a dictionary and a data stream. Each cross-reference stream contains the information equivalent to the cross-reference table (see Section 3.4.3, “Cross-Reference Table”) and trailer (see Section 3.4.4, “File Trailer”) for one cross-reference section.
Note
p.106
Note that the value following the startxref keyword is now the offset of the cross-reference stream rather than the xref keyword. For files that use cross-reference streams entirely, the keywords xref and trailer are no longer used. Therefore, with the exception of the startxref address %%EOF segment and comments, a PDF 1.5 file is entirely a sequence of objects.
Note
But
Consideration
So points
So, the dependencies are circulated!!
Experiment
lib/rpdf2txt/parser.rb#build_object_catalogue
def build_object_catalogue startobj=0 endobj=0 catalogue = {} @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match| obj = build_object(match.to_s) catalogue.store(obj.oid, obj) end catalogue end
lib/rpdf2txt/parser.rb#rebuild_object_catalogue
def rebuild_object_catalogue object_catalogue.values.select do |obj| obj.is_a?(ObjStream) end.each do |obj| scan_object_stream obj.decoded_stream, object_catalogue end end
lib/rpdf2txt/parser.rb#page_tree_root
def page_tree_root catalogue = object_catalogue trailer = trailer_dictionary rebuild_object_catalogue catalogue[trailer.root_id] end
Note
This is the point!
Result
Commit
Run de.oddb.org/bin/oddbd
Run jobs/imoprt_gkv
Result
mhatakeyama@ywesee.com 詳細を表示 12:22 (55分前) Fri Nov 26 11:41:21 2010: de.oddb.org ODDB::Import::Gkv#import Imported 6567 Zubef-Entries on 26.11.2010: Visited 6510 existing Zubef-Entries Visited 6567 existing Companies Visited 1073 existing Substances Created 57 new Zubef-Entries Created 4 new Products Created 45 new Sequences Created 0 new Companies Created 0 new Substances Assigned 0 Chemical Equivalences Assigned 0 Companies Created 10 Incomplete Packages: http://de.oddb.org/de/drugs/package/pzn/5522832 http://de.oddb.org/de/drugs/package/pzn/7652361 http://de.oddb.org/de/drugs/package/pzn/7652378 http://de.oddb.org/de/drugs/package/pzn/7713364 http://de.oddb.org/de/drugs/package/pzn/7713370 http://de.oddb.org/de/drugs/package/pzn/7713387 http://de.oddb.org/de/drugs/package/pzn/7713393 http://de.oddb.org/de/drugs/package/pzn/7713507 http://de.oddb.org/de/drugs/package/pzn/7714085 http://de.oddb.org/de/drugs/package/pzn/5527404 Created 1 Product(s) without a name (missing product name): http://de.oddb.org/de/drugs/product/uid/3480899
Check the downloaded pdf file (version)
How to update a library on production server
lrwxrwxrwx 1 root root 38 2009-04-06 18:40 rpdf2txt -> /home/ywesee/git/rpdf2txt/lib/rpdf2txt so all I done is a git checkout -f in /home/ywesee/git/rpdf2txt
Check the current result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_object.rb Loaded suite test/test_pdf_object Started ......................'invalid literal/lengths set' when filtering with /FlateDecode .ruby: symbol lookup error: /usr/lib64/ruby/gems/1.8/gems/rmagick-2.9.0/lib/RMagick2.so: undefined symbol: DestroyConstitute masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_parser.rb Loaded suite test/test_pdf_parser Started ...F........F.....E... Finished in 3.813254 seconds. 1) Failure: test_encrypt(TestParser) [test/test_pdf_parser.rb:1319]: <395> expected but was <nil>. 2) Failure: test_join_snippets__hex_chars(TestParser) [test/test_pdf_parser.rb:316]: <"Paroxetin besitzt eine selektive Wirkung; in-vitro Studien haben gezeigt, dass es, im Gegensatz zu\ntrizyklischen Antidepressiva, eine geringe Affinit\344t f\374r a1-, a2- und b-Adrenozeptoren sowie f\374r\nDopamin (D2)-, 5-HT1-artige, 5-HT2 und Histamin (H1)-Rezeptoren aufweist. Das Fehlen einer\n"> expected but was <"Paroxetin besitzt eine selektive Wirkung; in-vitro Studien haben gezeigt, dass es, im Gegensatz zu\ntrizyklischen Antidepressiva, eine geringe Affinit\344t f\374r a1-, a2- und b-Adrenozeptoren sowie f\374r\nDopamin (D2)-, 5-HT1-artige, 5-HT2 und Histamin (H1)-Rezeptoren aufweist. Das Fehlen einer\n">. 3) Error: test_trailer_dictionary(TestParser): NoMethodError: undefined method `values' for nil:NilClass /usr/lib64/ruby/site_ruby/1.8/rpdf2txt/parser.rb:54:in `build_trailer_dictionary' test/test_pdf_parser.rb:1292:in `test_trailer_dictionary' 22 tests, 59 assertions, 2 failures, 1 errors masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_text.rb Loaded suite test/test_pdf_text Started ................................ Finished in 1.170674 seconds. 32 tests, 41 assertions, 0 failures, 0 errors masa@masa ~/ywesee/rpdf2txt $ ruby test/test_space_bug_05_2004.rb Loaded suite test/test_space_bug_05_2004 Started . Finished in 0.385029 seconds. 1 tests, 1 assertions, 0 failures, 0 errors masa@masa ~/ywesee/rpdf2txt $ ruby test/test_stream.rb Loaded suite test/test_stream Started .......... Finished in 0.751271 seconds. 10 tests, 14 assertions, 0 failures, 0 errors masa@masa ~/ywesee/rpdf2txt $ ruby test/test_text_state.rb Loaded suite test/test_text_state Started .............. Finished in 1.6615 seconds. 14 tests, 75 assertions, 0 failures, 0 errors
Note
pass
pass
pass
Memo
def test_encrypt input =' SDSdASDASd trailer << /Size 476 /Info 388 0 R /Encrypt 395 0 R /Root 394 0 R /Prev 203754 /ID[<8664e6986751f2a49dccc9a4b40a4f18v><e720b2184372f5e3f4edd86673b81dfd>] >> startxref adfadfadf trailer << /Size 500 /ID[<8664e6986751f2a49dccc9a4b40a4f18v><e720b2184372f5e3f4edd86673b81dfd>] >> startxref' @parser.src = input @parser.object_catalogue @parser.trailer_dictionary assert_equal(395, @parser.encrypt_id) assert_equal("500", @parser.trailer_dictionary.attributes[:size]) end
Note
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_parser.rb Loaded suite test/test_pdf_parser Started ............F.....E... Finished in 3.712914 seconds. 1) Failure: test_join_snippets__hex_chars(TestParser) [test/test_pdf_parser.rb:316]: <"Paroxetin besitzt eine selektive Wirkung; in-vitro Studien haben gezeigt, dass es, im Gegensatz zu\ntrizyklischen Antidepressiva, eine geringe Affinit\344t f\374r a1-, a2- und b-Adrenozeptoren sowie f\374r\nDopamin (D2)-, 5-HT1-artige, 5-HT2 und Histamin (H1)-Rezeptoren aufweist. Das Fehlen einer\n"> expected but was <"Paroxetin besitzt eine selektive Wirkung; in-vitro Studien haben gezeigt, dass es, im Gegensatz zu\ntrizyklischen Antidepressiva, eine geringe Affinit\344t f\374r a1-, a2- und b-Adrenozeptoren sowie f\374r\nDopamin (D2)-, 5-HT1-artige, 5-HT2 und Histamin (H1)-Rezeptoren aufweist. Das Fehlen einer\n">. 2) Error: test_trailer_dictionary(TestParser): NoMethodError: undefined method `values' for nil:NilClass /usr/lib64/ruby/site_ruby/1.8/rpdf2txt/parser.rb:54:in `build_trailer_dictionary' test/test_pdf_parser.rb:1292:in `test_trailer_dictionary' 22 tests, 60 assertions, 1 failures, 1 errors
Add $KCODE
$KCODE = "UTF8"
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_parser.rb Loaded suite test/test_pdf_parser Started ..................E... Finished in 3.741281 seconds. 1) Error: test_trailer_dictionary(TestParser): NoMethodError: undefined method `values' for nil:NilClass /usr/lib64/ruby/site_ruby/1.8/rpdf2txt/parser.rb:54:in `build_trailer_dictionary' test/test_pdf_parser.rb:1294:in `test_trailer_dictionary' 22 tests, 60 assertions, 0 failures, 1 errors
def test_trailer_dictionary input =' SDSdASDASd trailer << /Size 476 /Info 388 0 R /Encrypt 395 0 R /Root 394 0 R /Prev 203754 /ID[<8664e6986751f2a49dccc9a4b40a4f18v><e720b2184372f5e3f4edd86673b81dfd>] >> startxref adfadfadf trailer << /Size 500 /ID[<8664e6986751f2a49dccc9a4b40a4f18v><e720b2184372f5e3f4edd86673b81dfd>] >> startxref' @parser.src = input @parser.object_catalogue @parser.build_trailer_dictionary assert_equal("500", @parser.trailer_dictionary.attributes[:size]) assert_equal("388 0 R", @parser.trailer_dictionary.attributes[:info]) assert_equal(395, @parser.trailer_dictionary.encrypt_id) end
Note
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_parser.rb Loaded suite test/test_pdf_parser Started ...................... Finished in 3.735419 seconds. 22 tests, 63 assertions, 0 failures, 0 errors
Problem
suspend
Reinstall rmagick
masa@masa ~/ywesee/rpdf2txt $ sudo gem uninstall rmagick masa@masa ~/ywesee/rpdf2txt $ sudo gem install rmagick
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_object.rb Loaded suite test/test_pdf_object Started ......................'invalid literal/lengths set' when filtering with /FlateDecode .......E.....E................unknown encoding 370 0 R ... Finished in 5.116151 seconds. 1) Error: test_inline_img(Rpdf2txt::TestInlineImage): TypeError: can't convert nil into String test/test_pdf_object.rb:1797:in `exist?' test/test_pdf_object.rb:1797:in `test_inline_img' 2) Error: test_text_space_bug2(Rpdf2txt::TestPageLeaf): Errno::ENOENT: No such file or directory - /home/masa/ywesee/rpdf2txt/test/data/space_bug_stream2.txt test/test_pdf_object.rb:1665:in `read' test/test_pdf_object.rb:1665:in `test_text_space_bug2' 55 tests, 94 assertions, 0 failures, 2 errors
Notes
Comment out
=begin def test_text_space_bug2 stream = Stream.new path = File.expand_path('data/space_bug_stream2.txt', File.dirname(__FILE__)) fontsrc15 = <<-EOS 327 0 obj^M<</Subtype/Type1/FontDescriptor 325 0 R/LastChar 252/Widths[278 389 0 0 0 1000 722 278 333 333 556 600 278 333 278 278 556 556 556 556 556 556 556 556 556 556 278 278 0 600 0 0 0 722 611 611 722 556 500 722 722 278 389 667 500 944 722 778 556 778 611 500 556 722 667 1000 667 667 556 0 0 0 0 500 0 556 611 444 611 556 389 611 611 278 278 556 278 889 611 611 611 611 389 389 389 611 500 833 500 500 500 0 0 0 0 0 0 0 0 0 556 0 0 0 0 0 0 0 0 0 0 0 0 0 278 556 0 0 500 0 0 0 0 0 944 0 0 0 0 0 0 0 0 0 0 0 0 0 0 556 0 0 0 0 400 0 0 0 0 611 0 0 0 0 0 556 0 0 0 0 0 0 0 0 722 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 778 0 0 0 0 0 722 0 0 0 556 0 556 0 556 0 0 444 556 556 556 0 0 0 278 278 0 0 0 0 611 0 611 0 0 611 0 611 611]/BaseFont/PLWAZC+Frutiger-Roman/FirstChar 32/Encoding/WinAnsiEncoding/Type/Font>>^Mendobj EOS font15 = Font.new(fontsrc15) stream.decoded_stream = File.read path page = PageLeaf.new page.resources = resource = Resource.new resource.instance_variable_get('@fonts').store(:r15, font15) handler = SimpleHandler.new page.contents = [stream] page.text(handler) expected = "Inhalt / Table des mati\303\250res" assert_equal(expected.strip, handler.out.strip[0,28]) expected = '10 mg, 20 mg und 40 mg' assert_equal(expected.strip, handler.out.strip[346,22]) end =end
Note
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_object.rb Loaded suite test/test_pdf_object Started ......................'invalid literal/lengths set' when filtering with /FlateDecode .......E.....................unknown encoding 370 0 R ... Finished in 4.981177 seconds. 1) Error: test_inline_img(Rpdf2txt::TestInlineImage): TypeError: can't convert nil into String test/test_pdf_object.rb:1797:in `exist?' test/test_pdf_object.rb:1797:in `test_inline_img' 54 tests, 94 assertions, 0 failures, 1 errors
class TestInlineImage < Test::Unit::TestCase def test_inline_img attrs = <<-EOS /W 113 /CS /DeviceGray /BPC 8 /DP << /Predictor 15 /Columns 113 >> /F /Fl /H 1 EOS data = "x\234cd\2407\000\000\000\344\000\002" obj = InlineImage.new(attrs, data) assert_nothing_raised { obj.image } path = File.expand_path('data/inline.png', File.dirname(__FILE__)) good = Magick::Image.read path tmp_path = Tempfile.new('test').path + '.png' obj.image.write tmp_path tmp = Magick::Image.read tmp_path assert_equal(good, tmp) rescue StandardError => e p e ensure File.delete tmp_path if File.exist? tmp_path end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_object.rb Loaded suite test/test_pdf_object Started ......................'invalid literal/lengths set' when filtering with /FlateDecode .............................unknown encoding 370 0 R ... Finished in 5.051024 seconds. 54 tests, 95 assertions, 0 failures, 0 errors
Note
suit.rb (all tests)
masa@masa ~/ywesee/rpdf2txt $ ruby test/suite.rb Loaded suite test/suite Started ......................'invalid literal/lengths set' when filtering with /FlateDecode ...................................................................unknown encoding 370 0 R ............................................ Finished in 12.029007 seconds. 133 tests, 289 assertions, 0 failures, 0 errors
Next
add method test/test_pdf_parser.rb
def test_rebuild_object_catalogue file = File.expand_path('./data/encrypted_object_stream.pdf', File.dirname(__FILE__)) input = File.read(file) parser = Rpdf2txt::Parser.new(input) cat = parser.object_catalogue assert_equal(3, cat.length) assert_equal(cat[2545].class, Rpdf2txt::ObjStream) assert_equal(cat[3166].class, Rpdf2txt::TrailerDictionary) assert_equal(cat[2544].class, Rpdf2txt::PdfHash) parser.trailer_dictionary parser.rebuild_object_catalogue assert_equal(4, cat.length) assert_equal(cat[2530].class, Rpdf2txt::PdfHash) end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby test/test_pdf_parser.rb Loaded suite test/test_pdf_parser Started ....................... Finished in 3.785444 seconds. 23 tests, 69 assertions, 0 failures, 0 errors
Commit
cdrom::19:haldaemon,masa