<< | Index | >>
Default
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
Note
Ruby 1.9
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:1:in `require': /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:138: invalid multibyte char (US-ASCII) (SyntaxError) /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:138: syntax error, unexpected '~', expecting ')'
super("EOF", "�~~��~^^~" + rand(1e10).inspect) ^
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:138: invalid multibyte char (US-ASCII)
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:1:in `<top (required)>' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb:1:in `require' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb:1:in `<top (required)>' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:2:in `require' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:2:in `<top (required)>' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/textparser.rb:25:in `require' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/textparser.rb:25:in `<top (required)>' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:26:in `require' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:26:in `<top (required)>' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:26:in `require' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:26:in `<top (required)>' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:26:in `require' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:26:in `<top (required)>' from bin/rpdf2txt:25:in `require' from bin/rpdf2txt:25:in `<main>'
@]
Add magic comment (encoding) lib/rpdf2txt-rockit/rockit.rb
# encoding: ascii-8bit
lib/rpdf2txt-rockit/rockit_grammars_parser.rb
# encoding: ascii-8bit
Run again
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:30:in `require': no such file to load -- md5 (LoadError) from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:30:in `<top (required)>' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:26:in `require' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:26:in `<top (required)>' from bin/rpdf2txt:25:in `require' from bin/rpdf2txt:25:in `<main>'
Note
Replace 'md5'
#require 'md5' require 'digest/md5'
#require 'md5' require 'digest/md5'
Run again
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:132:in `scan': invalid byte sequence in UTF-8 (ArgumentError) from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:132:in `build_object_catalogue' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:49:in `object_catalogue' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:164:in `page_tree_root' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:146:in `build_page_tree' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:52:in `page_tree' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text' from bin/rpdf2txt:58:in `<main>'
Force the encoding (force_encoding)
# Builds the catalogue of all indirect objects found in the raw PDF source.
#
# Scans @src for every "<num> <gen> obj ... endobj" span, turns each span
# into an object with build_object and stores it under the object's oid.
#
# Returns a Hash mapping oid => object.
def build_object_catalogue
  catalogue = {}
  # Ruby 1.9 refuses to scan the binary PDF data as UTF-8 ("invalid byte
  # sequence in UTF-8"), so the source is re-tagged as raw bytes first.
  # Ruby 1.8 strings have no force_encoding, hence the respond_to? guard.
  @src.force_encoding('ascii-8bit') if @src.respond_to?(:force_encoding)
  @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match|
    obj = build_object(match.to_s)
    catalogue.store(obj.oid, obj)
  end
  catalogue
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/attributesparser.rb:38:in `require': /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdfattributes.rb:9: invalid multibyte char (US-ASCII) (SyntaxError) /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdfattributes.rb:9: syntax error, unexpected '~', expecting ')' t1 = EofToken.new("EOF",/^(�~~��~^^~2411366330)/), ^ /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdfattributes.rb:9: invalid multibyte char (US-ASCII) from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/attributesparser.rb:38:in `attributes_parser' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:77:in `_parse_attributes' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:84:in `parse_attributes' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:42:in `initialize' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:115:in `new' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:115:in `build_object' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:134:in `block in build_object_catalogue' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:133:in `scan' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:133:in `build_object_catalogue' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:49:in `object_catalogue' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:165:in `page_tree_root' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:147:in `build_page_tree' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:52:in `page_tree' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text' from bin/rpdf2txt:58:in `<main>'
Set ascii-8bit
lib/rpdf2txt/data/pdfattributes.rb
# encoding: ascii-8bit
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:446:in `extract_oids': undefined method `collect' for "5 0 R":String (NoMethodError) from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:458:in `build_tree' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:147:in `build_page_tree' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:52:in `page_tree' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text' from bin/rpdf2txt:58:in `<main>'
Note
Experiment
lib/rpdf2txt/object.rb#extract_oids
# Extracts the numeric object ids from one or more PDF references.
#
# array - a reference String such as "5 0 R", or an Array of such Strings.
#
# Returns an Array of Integers (the first run of digits of every entry);
# entries that contain no digits are dropped.
#
# The print/p calls are temporary debug output for this experiment.
def extract_oids(array)
  print "array="
  p array
  # On Ruby 1.9 a String does not respond to collect, so a bare reference
  # is wrapped in an Array. is_a? (rather than comparing the exact class)
  # keeps Array subclasses from being wrapped a second time.
  array = [array] unless array.is_a?(Array)
  result = array.collect { |dirty_id|
    if(match = /\d+/on.match(dirty_id))
      match[0].to_i
    end
  }.compact
  print "result="
  p result
  return result
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf array="5 0 R" result=[5] array=["4 0 R"] result=[4] array="2 0 R" result=[2] array=["6 0 R"] result=[6] untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf array="5 0 R" result=[5] array=["4 0 R"] result=[4] array="2 0 R" result=[2] array=["6 0 R"] result=[6] 'incorrect header check' when filtering with /FlateDecode /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:729:in `extract_text_objects': undefined method `+' for nil:NilClass (NoMethodError) from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:541:in `text' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:43:in `block in extract_text' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block (2 levels) in each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:443:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block in each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:472:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text' from bin/rpdf2txt:58:in `<main>'
Note
Experiment
lib/rpdf2txt/object.rb#extract_text_objects
def extract_text_objects(page, text_state) @page, @text_state = page, text_state stack = [] result = [] startpoint = decoded_stream.index(BT_PATTERN) endpoint = decoded_stream.index(ET_PATTERN) print "decoded_stream.size=" p decoded_stream.size if RUBY_VERSION.to_f >= 1.9 print "decoded_stream.encoding=" p decoded_stream.encoding end print "endpoint=" p endpoint
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf 'incorrect header check' when filtering with /FlateDecode decoded_stream.size=688 decoded_stream.encoding=#<Encoding:US-ASCII> endpoint=nil /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:737:in `extract_text_objects': undefined method `+' for nil:NilClass (NoMethodError) from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:541:in `text' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:43:in `block in extract_text' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block (2 levels) in each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:443:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block in each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:472:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text' from bin/rpdf2txt:58:in `<main>'
Note
Hypothesis
suspend
parser = Rpdf2txt::Parser.new(File.read(ARGV[0]), 'utf8')
Note
FlateDecode error
'incorrect header check' when filtering with /FlateDecode
Note
lib/rpdf2txt/object.rb#decode_raw_stream
def decode_raw_stream
  @decrypted_stream = raw_stream
  unless(@decoder.nil?)
    @decrypted_stream = @decoder.decrypt(self)
  end
  stream = @decrypted_stream
  [attributes[:filter]].flatten.compact.each { |filter|
    begin
      stream = case filter
               when "/FlateDecode"
                 flate_decode stream
               when "/LZWDecode"
                 lzw_decode stream
               else
                 raise "Unimplemented filter: #{filter}"
               end
    rescue StandardError => err
      warn "'#{err.message}' when filtering with #{filter}" #<= HERE
    end
  }
  stream
end
Experiment
[[http://scm.ywesee.com/?p=rpdf2txt/.git;a=blob;f=lib/rpdf2txt/object.rb;h=4842f2a7fbd7d2bd077f9d6a72f04dffbdb71b84;hb=HEAD#l758|lib/rpdf2txt/object.rb#raw_stream]]
def raw_stream
  print "@src.size="
  p @src.size
  @raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf @src.size=328 decoded_stream.size=559 endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf @src.size=328 'incorrect header check' when filtering with /FlateDecode decoded_stream.size=688 decoded_stream.encoding=#<Encoding:US-ASCII> endpoint=nil /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:737:in `extract_text_objects': undefined method `+' for nil:NilClass (NoMethodError) from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:541:in `text' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:43:in `block in extract_text' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block (2 levels) in each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:443:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block in each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:472:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text' from bin/rpdf2txt:60:in `<main>'
Note
Experiment
test.rb
# encoding: ascii-8bit require 'zlib' @src = File.read('test.pdf') if RUBY_VERSION.to_f >= 1.9 @src.force_encoding('ascii-8bit') end @raw_stream = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s print "@raw_stream.size=" p @raw_stream.size p Zlib::Inflate.inflate(@raw_stream)
Result
masa@masa ~/work $ ruby18 test.rb @raw_stream.size=256 "q Q q 18 40 576 734 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT\n10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET Q q 1 0 0 -1 18\n774 cm BT 10 0 0 -10 0 12 Tm /F1.0 1 Tf (untitled text) Tj ET Q q 1\n0 0 -1 18 774 cm BT 10 0 0 -10 0 24 Tm /F1.0 1 Tf (Printed: Donnerstag, 14. November 2002 14:04:29 Uhr)\nTj ET Q 0.25 w /Cs1 CS 0 0 0 SC q 1 0 0 -1 18 774 cm 0 36.125 m 576.25\n36.125 l S Q q 1 0 0 -1 18 774 cm 0 34.125 m 576.25 34.125 l S Q Q\nq 18 40 576 694 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT 10\n0 0 -10 1 52 Tm /F2.0 1 Tf (testpdf) Tj ET Q Q"
masa@masa ~/work $ ruby1.9 test.rb @raw_stream.size=688 test.rb:12:in `inflate': incorrect header check (Zlib::DataError) from test.rb:12:in `<main>'
Note
Experiment
test.rb
# Second attempt: same reproduction, but use the captured stream String
# itself instead of stringifying the whole scan result.
require 'zlib'
@src = File.read('test.pdf')
#open('test.pdf','rb') do |f|
#  @src=f.read
#end
if RUBY_VERSION.to_f >= 1.9
  @src.force_encoding('ascii-8bit')
end
#@raw_stream = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
# [0][0] selects the first match and its first capture group.
@raw_stream = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn)[0][0]
p Zlib::Inflate.inflate(@raw_stream)
Result
masa@masa ~/work $ ruby18 test.rb "q Q q 18 40 576 734 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT\n10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET Q q 1 0 0 -1 18\n774 cm BT 10 0 0 -10 0 12 Tm /F1.0 1 Tf (untitled text) Tj ET Q q 1\n0 0 -1 18 774 cm BT 10 0 0 -10 0 24 Tm /F1.0 1 Tf (Printed: Donnerstag, 14. November 2002 14:04:29 Uhr)\nTj ET Q 0.25 w /Cs1 CS 0 0 0 SC q 1 0 0 -1 18 774 cm 0 36.125 m 576.25\n36.125 l S Q q 1 0 0 -1 18 774 cm 0 34.125 m 576.25 34.125 l S Q Q\nq 18 40 576 694 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT 10\n0 0 -10 1 52 Tm /F2.0 1 Tf (testpdf) Tj ET Q Q"
masa@masa ~/work $ ruby1.9 test.rb "q Q q 18 40 576 734 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT\n10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET Q q 1 0 0 -1 18\n774 cm BT 10 0 0 -10 0 12 Tm /F1.0 1 Tf (untitled text) Tj ET Q q 1\n0 0 -1 18 774 cm BT 10 0 0 -10 0 24 Tm /F1.0 1 Tf (Printed: Donnerstag, 14. November 2002 14:04:29 Uhr)\nTj ET Q 0.25 w /Cs1 CS 0 0 0 SC q 1 0 0 -1 18 774 cm 0 36.125 m 576.25\n36.125 l S Q q 1 0 0 -1 18 774 cm 0 34.125 m 576.25 34.125 l S Q Q\nq 18 40 576 694 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT 10\n0 0 -10 1 52 Tm /F2.0 1 Tf (testpdf) Tj ET Q Q"
Note
Experiment
test.rb
require 'zlib' #@src = File.read('test.pdf') open('test.pdf','rb') do |f| @src=f.read end #if RUBY_VERSION.to_f >= 1.9 # @src.force_encoding('ascii-8bit') #end #@raw_stream = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s @raw_stream = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn)[0][0] p Zlib::Inflate.inflate(@raw_stream)
Result
masa@masa ~/work $ ruby18 test.rb "q Q q 18 40 576 734 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT\n10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET Q q 1 0 0 -1 18\n774 cm BT 10 0 0 -10 0 12 Tm /F1.0 1 Tf (untitled text) Tj ET Q q 1\n0 0 -1 18 774 cm BT 10 0 0 -10 0 24 Tm /F1.0 1 Tf (Printed: Donnerstag, 14. November 2002 14:04:29 Uhr)\nTj ET Q 0.25 w /Cs1 CS 0 0 0 SC q 1 0 0 -1 18 774 cm 0 36.125 m 576.25\n36.125 l S Q q 1 0 0 -1 18 774 cm 0 34.125 m 576.25 34.125 l S Q Q\nq 18 40 576 694 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT 10\n0 0 -10 1 52 Tm /F2.0 1 Tf (testpdf) Tj ET Q Q"
masa@masa ~/work $ ruby1.9 test.rb "q Q q 18 40 576 734 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT\n10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET Q q 1 0 0 -1 18\n774 cm BT 10 0 0 -10 0 12 Tm /F1.0 1 Tf (untitled text) Tj ET Q q 1\n0 0 -1 18 774 cm BT 10 0 0 -10 0 24 Tm /F1.0 1 Tf (Printed: Donnerstag, 14. November 2002 14:04:29 Uhr)\nTj ET Q 0.25 w /Cs1 CS 0 0 0 SC q 1 0 0 -1 18 774 cm 0 36.125 m 576.25\n36.125 l S Q q 1 0 0 -1 18 774 cm 0 34.125 m 576.25 34.125 l S Q Q\nq 18 40 576 694 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT 10\n0 0 -10 1 52 Tm /F2.0 1 Tf (testpdf) Tj ET Q Q"
Note
Experiment
lib/rpdf2txt/object.rb#raw_stream
# Returns the bytes between "stream" and "endstream" in this object's
# source (@src). The result is memoized in @raw_stream.
#
# Returns an empty String when the source contains no stream.
def raw_stream
  # The earlier form, scan(...).to_s, called to_s on the Array that scan
  # returns and fed the wrong bytes to the decoder on Ruby 1.9.
  # Indexing that Array with [0][0] instead raises NoMethodError on nil when
  # there is no stream. String#[] with a capture index returns nil in that
  # case, and to_s turns it into "".
  #@raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
  @raw_stream ||= @src[/stream[\r\n]{1,2}(.*)endstream/mn, 1].to_s
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 decoded_stream.encoding=#<Encoding:US-ASCII> endpoint=117 BT 10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/textparser.rb:38:in `require': /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdftext.rb:9: invalid multibyte char (US-ASCII) (SyntaxError) /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdftext.rb:9: syntax error, unexpected '~', expecting ')' t1 = EofToken.new("EOF",/^(�~~��~^^~3062921542)/), ^ /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdftext.rb:9: invalid multibyte char (US-ASCII) from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/textparser.rb:38:in `text_parser' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:74:in `scan' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:768:in `extract_text_objects' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:543:in `text' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:43:in `block in extract_text' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:490:in `block (2 levels) in each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:445:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:490:in `block in each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:489:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:489:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:474:in `each' from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text' from bin/rpdf2txt:60:in `<main>'
Note
Add magic comment
# encoding: ascii-8bit
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 decoded_stream.encoding=#<Encoding:US-ASCII> endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
Note
success!!
Experiment
stream = open(ARGV[0], 'rb') do |file| file.read end #parser = Rpdf2txt::Parser.new(File.read(ARGV[0]), 'utf8') parser = Rpdf2txt::Parser.new(stream, 'utf-8')
lib/rpdf2txt/parser.rb#build_object_catalogue
# Builds the catalogue of all indirect objects found in the raw PDF source.
#
# Scans @src for every "<num> <gen> obj ... endobj" span, turns each span
# into an object with build_object and stores it under the object's oid.
# While experimenting it also prints the encoding of @src.
#
# Returns a Hash mapping oid => object.
def build_object_catalogue
  catalogue = {}
  # Feature detection instead of parsing RUBY_VERSION as a float: only
  # strings that carry an encoding (Ruby 1.9+) respond to #encoding.
  if @src.respond_to?(:encoding)
    # Left disabled: this experiment checks the encoding of @src as handed
    # in by the caller, which now reads the file in 'rb' mode.
    #@src.force_encoding('ascii-8bit')
    print "@src.encoding="
    p @src.encoding
  end
  @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match|
    obj = build_object(match.to_s)
    catalogue.store(obj.oid, obj)
  end
  catalogue
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf @src.encoding=#<Encoding:ASCII-8BIT> decoded_stream.size=559 decoded_stream.encoding=#<Encoding:US-ASCII> endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
Note
Check the current status
masa@masa ~/ywesee/rpdf2txt $ ruby18 test/suite.rb Loaded suite test/suite Started ......................'invalid literal/lengths set' when filtering with /FlateDecode ...................................................................unknown encoding 370 0 R ............................................. Finished in 12.540428 seconds. 134 tests, 295 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 test/suite.rb test/suite.rb:26: warning: variable $KCODE is no longer effective; ignored test/suite.rb:29:in `require': /home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:177: invalid multibyte char (US-ASCII) (SyntaxError) /home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:174: Invalid char `\x0F' in expression /home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:174: invalid multibyte char (US-ASCII) /home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:174: syntax error, unexpected $end, expecting keyword_end /Title (���)��\\���#/�-&��;S��A) ^ from test/suite.rb:29:in `block in <main>' from test/suite.rb:28:in `foreach' from test/suite.rb:28:in `<main>'
Note
Add magic comment ('# encoding: ascii-8bit')
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 test/suite.rb test/suite.rb:26: warning: variable $KCODE is no longer effective; ignored /home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:28: warning: variable $KCODE is no longer effective; ignored /home/masa/ywesee/rpdf2txt/test/mock.rb:19:in `require': no such file to load -- runit/error (LoadError) from /home/masa/ywesee/rpdf2txt/test/mock.rb:19:in `<top (required)>' from /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:32:in `require' from /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:32:in `<top (required)>' from test/suite.rb:29:in `require' from test/suite.rb:29:in `block in <main>' from test/suite.rb:28:in `foreach' from test/suite.rb:28:in `<main>'
Replace 'runit/error' with 'test/unit'
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 test/suite.rb test/suite.rb:26: warning: variable $KCODE is no longer effective; ignored /home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:28: warning: variable $KCODE is no longer effective; ignored test/suite.rb:29:in `require': /home/masa/ywesee/rpdf2txt/test/test_pdf_text.rb:460: invalid multibyte char (US-ASCII) (SyntaxError) /home/masa/ywesee/rpdf2txt/test/test_pdf_text.rb:372: syntax error, unexpected $end, expecting tSTRING_CONTENT or tSTRING_DBEG or tSTRING_DVAR or tSTRING_END from test/suite.rb:29:in `block in <main>' from test/suite.rb:28:in `foreach' from test/suite.rb:28:in `<main>'
Add magic comment ('# encoding: ascii-8bit')
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 test/suite.rb ... 81) Error: test_txt(TestTextState): Encoding::CompatibilityError: incompatible encoding regexp match (UTF-8 regexp with ASCII-8BIT string) /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text_state.rb:309:in `[]' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text_state.rb:309:in `set_txt' /home/masa/ywesee/rpdf2txt/test/test_text_state.rb:312:in `test_txt' Finished in 4.273717702 seconds. 134 tests, 145 assertions, 8 failures, 73 errors, 0 pendings, 0 omissions, 0 notifications 39.5522% passed
Note
1) Error: test_cmap_bfchar(Rpdf2txt::TestCmap): ArgumentError: cannot make an element from nil /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:82:in `make_element' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `block in make_elements' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `map' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `make_elements' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:361:in `initialize' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:484:in `new' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:484:in `prod' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:134:in `block in eval_ast' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `map' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:132:in `eval_ast' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:129:in `block in eval_ast' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `map' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:129:in `eval_ast' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:115:in `rockit_productions_eval' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:180:in `rockit_grammar_eval' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:47:in `block in generate_parser_from_file_to_file' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/base_extensions.rb:65:in `call' 
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/base_extensions.rb:65:in `time_and_puts' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:46:in `generate_parser_from_file_to_file' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/cmapparser.rb:44:in `cmap_parser' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:1062:in `parse_cmap' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:1000:in `initialize' /home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:102:in `new' /home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:102:in `test_cmap_bfchar'
Add magic comment ('# encoding: ascii-8bit')
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started . Finished in 0.007427187 seconds. 1 tests, 1 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Next
test_parser_grammar_bfrange(Rpdf2txt::TestCmap): ArgumentError: cannot make an element from nil /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:82:in `make_element' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `block in make_elements' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `map' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `make_elements' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:361:in `initialize' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:484:in `new' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:484:in `prod' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:134:in `block in eval_ast' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `map' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:132:in `eval_ast' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:129:in `block in eval_ast' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `map' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:129:in `eval_ast' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:115:in `rockit_productions_eval' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:180:in `rockit_grammar_eval' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:47:in `block in generate_parser_from_file_to_file' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/base_extensions.rb:65:in `call' 
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/base_extensions.rb:65:in `time_and_puts' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:46:in `generate_parser_from_file_to_file' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/cmapparser.rb:58:in `cmap_range_parser' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:1066:in `parse_cmap' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:1000:in `initialize' test/test_pdf_object.rb:108:in `new' test/test_pdf_object.rb:108:in `test_parser_grammar_bfrange' Finished in 0.021427573 seconds.
Add magic comment ('# encoding: ascii-8bit')
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started .... Finished in 0.013319168 seconds. 4 tests, 6 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Next
test_decrypt(Rpdf2txt::TestEncrypt): ArgumentError: invalid byte sequence in UTF-8 /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:292:in `check' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:292:in `block in peek' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:291:in `each' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:291:in `peek' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/glr_parser.rb:133:in `actor' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/glr_parser.rb:70:in `parse' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:79:in `_parse_attributes' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:86:in `parse_attributes' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:44:in `initialize' test/test_pdf_object.rb:536:in `new' test/test_pdf_object.rb:536:in `setup'
Replace 'File.read' with 'open' in 'rb' mode
class TestEncrypt < Test::Unit::TestCase def setup file = File.expand_path('./data/encrypt_string', File.dirname(__FILE__)) #src_encrypt_obj = File.read(file) src_encrypt_obj = open(file, 'rb'){|file| file.read} @encrypt = Rpdf2txt::PdfEncrypt.new(src_encrypt_obj) @encrypt.file_id = '8664e6986751f2a49dccc9a4b40a4f18' end def test_decrypt file = File.expand_path('./data/working_obj', File.dirname(__FILE__)) #input = File.read(file) input = open(file, 'rb'){|file| file.read} pdf_obj = Rpdf2txt::Stream.new(input) assert_equal("dc08b36009e48618f99c", @encrypt.decrypt_key(pdf_obj).unpack('h*').first) #if the stream could be inflated, the decryption is ok! assert_nothing_raised{ Zlib::Inflate.inflate(@encrypt.decrypt(pdf_obj)) Zlib::Inflate.inflate(@encrypt.decrypt(pdf_obj)) } end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started .......................unknown encoding 370 0 R .. Finished in 0.448861627 seconds. 25 tests, 52 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
All 'File.read' calls are replaced
Next
test_width(Rpdf2txt::TestFont) [test/test_pdf_object.rb:753]: <278> expected but was <nil>
# Returns the width of a character.
#
# char - a byte value, or a String; a one-character String is first
#        converted to the byte value of its first byte.
#
# Returns whatever _width(char) yields, falling back to named_width(char)
# when _width returns nil or false.
def width(char)
  if(char.is_a?(String) && char.length == 1)
    #char = char[0]
    # On the 1.9 branch the first byte is read explicitly via bytes;
    # the else branch keeps the previous char[0] form for older Rubies.
    if RUBY_VERSION > "1.9"
      char = char.bytes.to_a[0]
    else
      char = char[0]
    end
  end
  _width(char) || named_width(char)
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started ...................................unknown encoding 370 0 R ... Finished in 0.665297144 seconds. 38 tests, 69 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Next
test_text__fixed_double_lead_bug(Rpdf2txt::TestPageLeaf): Encoding::CompatibilityError: incompatible encoding regexp match (UTF-8 regexp with ASCII-8BIT string) /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text_state.rb:309:in `[]' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text_state.rb:309:in `set_txt' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:173:in `_snip' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:170:in `snip' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:144:in `block (2 levels) in scan_tree' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:94:in `each' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:94:in `block in scan_tree' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `each' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:82:in `scan_tree' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:142:in `block (2 levels) in scan_tree' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:94:in `each' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:94:in `block in scan_tree' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `each' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:82:in `scan_tree' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:75:in `scan' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:776:in `extract_text_objects' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:551:in `text' test/test_pdf_object.rb:1223:in `test_text__fixed_double_lead_bug'
lib/rpdf2txt/text_state.rb#set_txt
def set_txt(txt) #call the unescape_txt method, #so that \334 is replaced by char Ü #otherwise the calculation of the string width is wrong!!!! unescape_txt!(txt) @boxwidth = 0 txt.rstrip.each_byte do |char| @boxwidth += char_width(char) end @w = @boxwidth #if white = txt[/\s+$/u] if white = txt[/\s+$/n] white.each_byte do |char| @w += char_width(char) end end @txt = recode_txt(txt) end
Note
IMPORTANT
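: in Ruby 1.9, a regexp cannot be matched against a string whose encoding is incompatible with it (here the UTF-8 regexp /\s+$/u against an ASCII-8BIT string), so the regexp flag is changed from /u to /n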
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started ....................'invalid literal/lengths set' when filtering with /FlateDecode ...................unknown encoding 370 0 R ... Finished in 0.827054215 seconds. 42 tests, 75 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Check all the tests (test_pdf_object.rb)
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib test/test_pdf_object.rb Loaded suite test/test_pdf_object Started ......................'invalid literal/lengths set' when filtering with /FlateDecode .............................unknown encoding 370 0 R ... Finished in 5.157712 seconds. 54 tests, 95 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started ......................'invalid literal/lengths set' when filtering with /FlateDecode .............................unknown encoding 370 0 R ... Finished in 4.268063275 seconds. 54 tests, 95 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Check
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_space_bug_05_2004.rb Loaded suite test/test_space_bug_05_2004 Started . Finished in 0.233933132 seconds. 1 tests, 1 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Check
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_stream.rb Loaded suite test/test_stream Started .......... Finished in 0.691126396 seconds. 10 tests, 14 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Check
Next
test_extract_TD(Rpdf2txt::TestText): NameError: uninitialized constant Mock::RUNIT /home/masa/ywesee/rpdf2txt/test/mock.rb:108:in `__mock_call' /home/masa/ywesee/rpdf2txt/test/mock.rb:73:in `method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:178:in `text_state=' test/test_pdf_text.rb:100:in `test_extract_TD'
Experiment
test.rb
def foo(&block) block.arity end p foo{} p foo{||} p foo{|x|} p foo{|*x|} p foo{|x, y|} p foo{|x, *y|} p foo{|(x, y)|} p foo{|(x, y), z|}
Result
masa@masa ~/work $ ruby18 test.rb
-1
0
1
-1
2
-2
2
2
masa@masa ~/work $ ruby1.9 test.rb
0
0
1
-1
2
-2
1
2
Note
Experiment
def test_extract_Tc #PDF doc: Tc always has 1 operand @text.src = <<-EOS BT -0.0002 Tc ET EOS ast = Rpdf2txt.text_parser.parse(@text.src) assert_equal("-0.0002", ast.values.first.charspace.value) text_state = Mock.new("text_state") text_state.__next(:transformation_matrix=) {} text_state.__next(:transformation_matrix=) {|*x|} @text.text_state = text_state
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_text.rb Loaded suite test/test_pdf_text Started E 1) Error: test_extract_Tc(Rpdf2txt::TestText): NoMethodError: undefined method `to_i' for :transformation_matrix=:Symbol /home/masa/ywesee/rpdf2txt/test/mock.rb:171:in `__pre' /home/masa/ywesee/rpdf2txt/test/mock.rb:147:in `__mock_call' /home/masa/ywesee/rpdf2txt/test/mock.rb:88:in `method_missing' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:178:in `text_state=' test/test_pdf_text.rb:86:in `test_extract_Tc' Finished in 0.005110468 seconds. 1 tests, 1 assertions, 0 failures, 1 errors, 0 pendings, 0 omissions, 0 notifications 0% passed
Note
Experiment
def Mock.__pre( method ) #"__pre_#{method.to_i}".intern "__pre_#{method}".intern end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_text.rb Loaded suite test/test_pdf_text Started . Finished in 0.005063328 seconds. 1 tests, 1 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Test all the cases (test_pdf_text)
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib test/test_pdf_text.rb Loaded suite test/test_pdf_text Started ............................. Finished in 1.048242 seconds. 29 tests, 37 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_text.rb Loaded suite test/test_pdf_text Started ............................. Finished in 0.645440965 seconds. 29 tests, 37 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Final check
masa@masa ~/ywesee/rpdf2txt $ ruby18 test/suite.rb Loaded suite test/suite Started ......................'invalid literal/lengths set' when filtering with /FlateDecode ...................................................................unknown encoding 370 0 R .........#<Rpdf2txt::CMap:0x7f51de715f48 @target_encoding="utf8", @decoded_stream="", @decrypted_stream="", @src="<< >>", @raw_stream="", @map={}, @attributes={}> ....................... Finished in 9.191181 seconds. 121 tests, 277 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib test/suite.rb Loaded suite test/suite Started ......................'invalid literal/lengths set' when filtering with /FlateDecode ...................................................................unknown encoding 370 0 R .........E...................... Finished in 9.105659 seconds. 1) Error: test_join_snippets__hex_chars(TestParser): NoMethodError: undefined method `[]' for nil:NilClass ./lib/rpdf2txt/object.rb:787:in `raw_stream' ./lib/rpdf2txt/object.rb:790:in `decode_raw_stream' ./lib/rpdf2txt/object.rb:682:in `decoded_stream' ./lib/rpdf2txt/object.rb:1050:in `extract_bfchar' ./lib/rpdf2txt/object.rb:1069:in `parse_cmap' ./lib/rpdf2txt/object.rb:1008:in `initialize' /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:313:in `new' /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:313:in `test_join_snippets__hex_chars' 121 tests, 277 assertions, 0 failures, 1 errors
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/suite.rb test/suite.rb:26: warning: variable $KCODE is no longer effective; ignored /home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:28: warning: variable $KCODE is no longer effective; ignored Loaded suite test/suite Started ......................'invalid literal/lengths set' when filtering with /FlateDecode ......................................................... ..........unknown encoding 370 0 R .........E........F............. 1) Error: test_join_snippets__hex_chars(TestParser): NoMethodError: undefined method `each' for nil:NilClass /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:107:in `extract_attributes' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:88:in `parse_attributes' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:44:in `initialize' /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:1007:in `initialize' /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:313:in `new' /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:313:in `test_join_snippets__hex_chars' 2) Failure: test_char_width(TestTextState) [/home/masa/ywesee/rpdf2txt/test/test_text_state.rb:303]: <0.313> expected but was <0.301> diff: ? 0.3013 Finished in 7.296597714 seconds. 121 tests, 277 assertions, 1 failures, 1 errors, 0 pendings, 0 omissions, 0 notifications 98.3471% passed
Note