<< | Index | >>
Default

masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
Note
Ruby 1.9
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:1:in `require': /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:138: invalid multibyte char (US-ASCII) (SyntaxError) /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:138: syntax error, unexpected '~', expecting ')'
super("EOF", "�~~��~^^~" + rand(1e10).inspect)
^
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:138: invalid multibyte char (US-ASCII)
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:1:in `<top (required)>'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb:1:in `require'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb:1:in `<top (required)>'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:2:in `require'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:2:in `<top (required)>'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/textparser.rb:25:in `require'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/textparser.rb:25:in `<top (required)>'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:26:in `require'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:26:in `<top (required)>'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:26:in `require'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:26:in `<top (required)>'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:26:in `require'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:26:in `<top (required)>'
from bin/rpdf2txt:25:in `require'
from bin/rpdf2txt:25:in `<main>'
@]
Add magic comment (encoding) lib/rpdf2txt-rockit/rockit.rb
# encoding: ascii-8bit
lib/rpdf2txt-rockit/rockit_grammars_parser.rb
# encoding: ascii-8bit
Run again
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:30:in `require': no such file to load -- md5 (LoadError)
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:30:in `<top (required)>'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:26:in `require'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:26:in `<top (required)>'
from bin/rpdf2txt:25:in `require'
from bin/rpdf2txt:25:in `<main>'
Note
Replace 'md5'
#require 'md5' require 'digest/md5'
#require 'md5' require 'digest/md5'
Run again
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:132:in `scan': invalid byte sequence in UTF-8 (ArgumentError)
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:132:in `build_object_catalogue'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:49:in `object_catalogue'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:164:in `page_tree_root'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:146:in `build_page_tree'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:52:in `page_tree'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text'
from bin/rpdf2txt:58:in `<main>'
Force the encoding of the source string (force_encoding)
# Builds the object catalogue: a Hash that maps each object's oid to the
# object returned by build_object.
#
# Every "<number> <number> obj ... endobj" span found in @src is passed to
# build_object as a String and stored under the oid of the result.
#
# @src holds raw PDF bytes, so it is tagged as ASCII-8BIT before scanning;
# scanning it while it was tagged UTF-8 raised
# "invalid byte sequence in UTF-8" on Ruby 1.9. Ruby 1.8 strings have no
# force_encoding method, hence the respond_to? guard.
#
# Returns the Hash (empty when no object is found).
def build_object_catalogue
  catalogue = {}
  @src.force_encoding('ascii-8bit') if @src.respond_to?(:force_encoding)
  @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match|
    obj = build_object(match.to_s)
    catalogue.store(obj.oid, obj)
  end
  catalogue
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/attributesparser.rb:38:in `require': /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdfattributes.rb:9: invalid multibyte char (US-ASCII) (SyntaxError)
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdfattributes.rb:9: syntax error, unexpected '~', expecting ')'
t1 = EofToken.new("EOF",/^(�~~��~^^~2411366330)/),
^
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdfattributes.rb:9: invalid multibyte char (US-ASCII)
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/attributesparser.rb:38:in `attributes_parser'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:77:in `_parse_attributes'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:84:in `parse_attributes'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:42:in `initialize'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:115:in `new'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:115:in `build_object'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:134:in `block in build_object_catalogue'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:133:in `scan'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:133:in `build_object_catalogue'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:49:in `object_catalogue'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:165:in `page_tree_root'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:147:in `build_page_tree'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:52:in `page_tree'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text'
from bin/rpdf2txt:58:in `<main>'
Set ascii-8bit
lib/rpdf2txt/data/pdfattributes.rb
# encoding: ascii-8bit
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:446:in `extract_oids': undefined method `collect' for "5 0 R":String (NoMethodError)
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:458:in `build_tree'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:147:in `build_page_tree'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:52:in `page_tree'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text'
from bin/rpdf2txt:58:in `<main>'
Note
Experiment
lib/rpdf2txt/object.rb#extract_oids
# Debug-instrumented version of extract_oids.
#
# Accepts either an Array of reference strings (e.g. ["4 0 R"]) or a single
# value (e.g. "5 0 R"); a non-Array argument is wrapped in a one-element
# Array first. For each entry the first run of digits is converted to an
# Integer; entries without digits are dropped.
#
# Prints the argument ("array=...") and the outcome ("result=...") to
# standard output, then returns the Array of Integers.
def extract_oids(array)
  print "array="
  p array
  entries = array.instance_of?(Array) ? array : [array]
  result = []
  entries.each do |dirty_id|
    found = /\d+/on.match(dirty_id)
    result << found[0].to_i if found
  end
  print "result="
  p result
  result
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf array="5 0 R" result=[5] array=["4 0 R"] result=[4] array="2 0 R" result=[2] array=["6 0 R"] result=[6] untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf
array="5 0 R"
result=[5]
array=["4 0 R"]
result=[4]
array="2 0 R"
result=[2]
array=["6 0 R"]
result=[6]
'incorrect header check' when filtering with /FlateDecode
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:729:in `extract_text_objects': undefined method `+' for nil:NilClass (NoMethodError)
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:541:in `text'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:43:in `block in extract_text'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block (2 levels) in each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:443:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block in each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:472:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text'
from bin/rpdf2txt:58:in `<main>'
Note
Experiment
lib/rpdf2txt/object.rb#extract_text_objects
def extract_text_objects(page, text_state)
@page, @text_state = page, text_state
stack = []
result = []
startpoint = decoded_stream.index(BT_PATTERN)
endpoint = decoded_stream.index(ET_PATTERN)
print "decoded_stream.size="
p decoded_stream.size
if RUBY_VERSION.to_f >= 1.9
print "decoded_stream.encoding="
p decoded_stream.encoding
end
print "endpoint="
p endpoint
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf
'incorrect header check' when filtering with /FlateDecode
decoded_stream.size=688
decoded_stream.encoding=#<Encoding:US-ASCII>
endpoint=nil
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:737:in `extract_text_objects': undefined method `+' for nil:NilClass (NoMethodError)
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:541:in `text'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:43:in `block in extract_text'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block (2 levels) in each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:443:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block in each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:472:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text'
from bin/rpdf2txt:58:in `<main>'
Note
Hypothesis
suspend
parser = Rpdf2txt::Parser.new(File.read(ARGV[0]), 'utf8')
Note
FlateDecode error
'incorrect header check' when filtering with /FlateDecode
Note
lib/rpdf2txt/object.rb#decode_raw_stream
# Returns the stream contents after decryption and filter decoding.
#
# The raw stream is stored in @decrypted_stream; when @decoder is set, the
# result of @decoder.decrypt(self) is stored there instead. Each filter named
# in attributes[:filter] (a single name or a list; nil entries are ignored)
# is then applied in order to the running result.
#
# A filter that raises a StandardError, or a filter name that is not
# implemented, does not abort decoding: a warning is written and the stream
# is passed on unchanged to the next filter.
def decode_raw_stream
  @decrypted_stream = raw_stream
  unless(@decoder.nil?)
    @decrypted_stream = @decoder.decrypt(self)
  end
  stream = @decrypted_stream
  [attributes[:filter]].flatten.compact.each { |filter|
    begin
      stream = case filter
               when "/FlateDecode"
                 flate_decode stream
               when "/LZWDecode"
                 lzw_decode stream
               else
                 raise "Unimplemented filter: #{filter}"
               end
    rescue StandardError => err
      # <= HERE: origin of the
      # "'incorrect header check' when filtering with /FlateDecode" message.
      warn "'#{err.message}' when filtering with #{filter}"
    end
  }
  stream
end
Experiment
[[http://scm.ywesee.com/?p=rpdf2txt/.git;a=blob;f=lib/rpdf2txt/object.rb;h=4842f2a7fbd7d2bd077f9d6a72f04dffbdb71b84;hb=HEAD#l758|lib/rpdf2txt/object.rb#raw_stream]]
[@
# Debug-instrumented accessor for the text between "stream" and "endstream"
# in @src. Prints the size of @src on every call and memoizes the result in
# @raw_stream.
#
# NOTE: scan returns an Array of capture groups and the result is converted
# with to_s. Ruby 1.8 joins the elements, whereas Ruby 1.9 returns the
# Array's inspect form, so the two interpreters yield different strings here.
def raw_stream
  print("@src.size=", @src.size.inspect, "\n")
  @raw_stream || (@raw_stream = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s)
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf @src.size=328 decoded_stream.size=559 endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf
@src.size=328
'incorrect header check' when filtering with /FlateDecode
decoded_stream.size=688
decoded_stream.encoding=#<Encoding:US-ASCII>
endpoint=nil
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:737:in `extract_text_objects': undefined method `+' for nil:NilClass (NoMethodError)
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:541:in `text'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:43:in `block in extract_text'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block (2 levels) in each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:443:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:488:in `block in each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:487:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:472:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text'
from bin/rpdf2txt:60:in `<main>'
Note
Experiment
test.rb
# encoding: ascii-8bit
# Stand-alone reproduction: read test.pdf, cut out the stream section with
# the same regexp and to_s conversion used by raw_stream, print its size and
# try to inflate it.
require 'zlib'

@src = File.read('test.pdf')
# String#force_encoding only exists from Ruby 1.9 on.
@src.force_encoding('ascii-8bit') if RUBY_VERSION.to_f >= 1.9
stream_matches = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn)
# to_s on the Array of captures is the conversion under test.
@raw_stream = stream_matches.to_s
print("@raw_stream.size=", @raw_stream.size.inspect, "\n")
p Zlib::Inflate.inflate(@raw_stream)
Result
masa@masa ~/work $ ruby18 test.rb @raw_stream.size=256 "q Q q 18 40 576 734 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT\n10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET Q q 1 0 0 -1 18\n774 cm BT 10 0 0 -10 0 12 Tm /F1.0 1 Tf (untitled text) Tj ET Q q 1\n0 0 -1 18 774 cm BT 10 0 0 -10 0 24 Tm /F1.0 1 Tf (Printed: Donnerstag, 14. November 2002 14:04:29 Uhr)\nTj ET Q 0.25 w /Cs1 CS 0 0 0 SC q 1 0 0 -1 18 774 cm 0 36.125 m 576.25\n36.125 l S Q q 1 0 0 -1 18 774 cm 0 34.125 m 576.25 34.125 l S Q Q\nq 18 40 576 694 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT 10\n0 0 -10 1 52 Tm /F2.0 1 Tf (testpdf) Tj ET Q Q"
masa@masa ~/work $ ruby1.9 test.rb
@raw_stream.size=688
test.rb:12:in `inflate': incorrect header check (Zlib::DataError)
from test.rb:12:in `<main>'
Note
Experiment
test.rb
# Stand-alone reproduction, second variant: the stream bytes are taken from
# the first capture of the first match instead of calling to_s on the Array
# returned by scan.
require 'zlib'

@src = File.read('test.pdf')
#open('test.pdf','rb') do |f|
# @src=f.read
#end
# String#force_encoding only exists from Ruby 1.9 on.
@src.force_encoding('ascii-8bit') if RUBY_VERSION.to_f >= 1.9
#@raw_stream = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
stream_matches = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn)
@raw_stream = stream_matches.first.first
p Zlib::Inflate.inflate(@raw_stream)
Result
masa@masa ~/work $ ruby18 test.rb "q Q q 18 40 576 734 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT\n10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET Q q 1 0 0 -1 18\n774 cm BT 10 0 0 -10 0 12 Tm /F1.0 1 Tf (untitled text) Tj ET Q q 1\n0 0 -1 18 774 cm BT 10 0 0 -10 0 24 Tm /F1.0 1 Tf (Printed: Donnerstag, 14. November 2002 14:04:29 Uhr)\nTj ET Q 0.25 w /Cs1 CS 0 0 0 SC q 1 0 0 -1 18 774 cm 0 36.125 m 576.25\n36.125 l S Q q 1 0 0 -1 18 774 cm 0 34.125 m 576.25 34.125 l S Q Q\nq 18 40 576 694 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT 10\n0 0 -10 1 52 Tm /F2.0 1 Tf (testpdf) Tj ET Q Q"
masa@masa ~/work $ ruby1.9 test.rb "q Q q 18 40 576 734 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT\n10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET Q q 1 0 0 -1 18\n774 cm BT 10 0 0 -10 0 12 Tm /F1.0 1 Tf (untitled text) Tj ET Q q 1\n0 0 -1 18 774 cm BT 10 0 0 -10 0 24 Tm /F1.0 1 Tf (Printed: Donnerstag, 14. November 2002 14:04:29 Uhr)\nTj ET Q 0.25 w /Cs1 CS 0 0 0 SC q 1 0 0 -1 18 774 cm 0 36.125 m 576.25\n36.125 l S Q q 1 0 0 -1 18 774 cm 0 34.125 m 576.25 34.125 l S Q Q\nq 18 40 576 694 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT 10\n0 0 -10 1 52 Tm /F2.0 1 Tf (testpdf) Tj ET Q Q"
Note
Experiment
test.rb
# Stand-alone reproduction, third variant: the file is opened in binary
# ('rb') mode instead of being read with File.read and re-tagged with
# force_encoding.
require 'zlib'

#@src = File.read('test.pdf')
@src = open('test.pdf', 'rb') { |io| io.read }
#if RUBY_VERSION.to_f >= 1.9
# @src.force_encoding('ascii-8bit')
#end
#@raw_stream = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
stream_matches = @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn)
@raw_stream = stream_matches.first.first
p Zlib::Inflate.inflate(@raw_stream)
Result
masa@masa ~/work $ ruby18 test.rb "q Q q 18 40 576 734 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT\n10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET Q q 1 0 0 -1 18\n774 cm BT 10 0 0 -10 0 12 Tm /F1.0 1 Tf (untitled text) Tj ET Q q 1\n0 0 -1 18 774 cm BT 10 0 0 -10 0 24 Tm /F1.0 1 Tf (Printed: Donnerstag, 14. November 2002 14:04:29 Uhr)\nTj ET Q 0.25 w /Cs1 CS 0 0 0 SC q 1 0 0 -1 18 774 cm 0 36.125 m 576.25\n36.125 l S Q q 1 0 0 -1 18 774 cm 0 34.125 m 576.25 34.125 l S Q Q\nq 18 40 576 694 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT 10\n0 0 -10 1 52 Tm /F2.0 1 Tf (testpdf) Tj ET Q Q"
masa@masa ~/work $ ruby1.9 test.rb "q Q q 18 40 576 734 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT\n10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET Q q 1 0 0 -1 18\n774 cm BT 10 0 0 -10 0 12 Tm /F1.0 1 Tf (untitled text) Tj ET Q q 1\n0 0 -1 18 774 cm BT 10 0 0 -10 0 24 Tm /F1.0 1 Tf (Printed: Donnerstag, 14. November 2002 14:04:29 Uhr)\nTj ET Q 0.25 w /Cs1 CS 0 0 0 SC q 1 0 0 -1 18 774 cm 0 36.125 m 576.25\n36.125 l S Q q 1 0 0 -1 18 774 cm 0 34.125 m 576.25 34.125 l S Q Q\nq 18 40 576 694 re W n /Cs1 cs 0 0 0 sc q 1 0 0 -1 18 774 cm BT 10\n0 0 -10 1 52 Tm /F2.0 1 Tf (testpdf) Tj ET Q Q"
Note
Experiment
lib/rpdf2txt/object.rb#raw_stream
# Returns the bytes between "stream" and "endstream" in @src, memoized in
# @raw_stream.
#
# The first capture group is taken directly as a String, so the result is the
# same on Ruby 1.8 and 1.9 (the previous Array#to_s conversion was not).
# When @src contains no stream section an empty String is returned rather
# than raising NoMethodError on nil.
def raw_stream
  #@raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).to_s
  @raw_stream ||= @src[/stream[\r\n]{1,2}(.*)endstream/mn, 1].to_s
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf
decoded_stream.size=559
decoded_stream.encoding=#<Encoding:US-ASCII>
endpoint=117
BT
10 0 0 -10 510 12 Tm /F1.0 1 Tf (Page 1 of 1) Tj ET
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/textparser.rb:38:in `require': /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdftext.rb:9: invalid multibyte char (US-ASCII) (SyntaxError)
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdftext.rb:9: syntax error, unexpected '~', expecting ')'
t1 = EofToken.new("EOF",/^(�~~��~^^~3062921542)/),
^
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/data/pdftext.rb:9: invalid multibyte char (US-ASCII)
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/textparser.rb:38:in `text_parser'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:74:in `scan'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:768:in `extract_text_objects'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:543:in `text'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:43:in `block in extract_text'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:490:in `block (2 levels) in each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:445:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:490:in `block in each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:489:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:489:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:474:in `each'
from /home/masa/ywesee/rpdf2txt/lib/rpdf2txt/parser.rb:42:in `extract_text'
from bin/rpdf2txt:60:in `<main>'
Note
Add magic comment
# encoding: ascii-8bit
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 decoded_stream.encoding=#<Encoding:US-ASCII> endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
Note
success!!
Experiment
# Read the PDF named on the command line in binary ('rb') mode so that the
# bytes are not tagged with a text encoding on Ruby 1.9.
# File.open is used rather than Kernel#open because Kernel#open treats an
# argument beginning with "|" as a command to run instead of a file name.
stream = File.open(ARGV[0], 'rb') do |file|
  file.read
end
#parser = Rpdf2txt::Parser.new(File.read(ARGV[0]), 'utf8')
parser = Rpdf2txt::Parser.new(stream, 'utf-8')
lib/rpdf2txt/parser.rb#build_object_catalogue
# Debug-instrumented version of build_object_catalogue.
#
# On Ruby 1.9 and later the encoding of @src is printed to standard output
# (the force_encoding call is disabled for this experiment). Every
# "<number> <number> obj ... endobj" span in @src is then passed to
# build_object, and the result is stored under its oid.
#
# Returns the Hash of oid => object.
def build_object_catalogue
  if RUBY_VERSION.to_f >= 1.9
    #@src.force_encoding('ascii-8bit')
    print("@src.encoding=", @src.encoding.inspect, "\n")
  end
  objects = {}
  @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) { |chunk|
    parsed = build_object(chunk.to_s)
    objects[parsed.oid] = parsed
  }
  objects
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib bin/rpdf2txt test/data/test.pdf decoded_stream.size=559 endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib bin/rpdf2txt test/data/test.pdf @src.encoding=#<Encoding:ASCII-8BIT> decoded_stream.size=559 decoded_stream.encoding=#<Encoding:US-ASCII> endpoint=117 untitled text Page 1 of 1 Printed: Donnerstag, 14. November 2002 14:04:29 Uhr testpdf
Note
Check the current status
masa@masa ~/ywesee/rpdf2txt $ ruby18 test/suite.rb Loaded suite test/suite Started ......................'invalid literal/lengths set' when filtering with /FlateDecode ...................................................................unknown encoding 370 0 R ............................................. Finished in 12.540428 seconds. 134 tests, 295 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 test/suite.rb
test/suite.rb:26: warning: variable $KCODE is no longer effective; ignored
test/suite.rb:29:in `require': /home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:177: invalid multibyte char (US-ASCII) (SyntaxError)
/home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:174: Invalid char `\x0F' in expression
/home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:174: invalid multibyte char (US-ASCII)
/home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:174: syntax error, unexpected $end, expecting keyword_end
/Title (���)��\\���#/�-&��;S��A)
^
from test/suite.rb:29:in `block in <main>'
from test/suite.rb:28:in `foreach'
from test/suite.rb:28:in `<main>'
Note
Add magic comment ('# encoding: ascii-8bit')
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 test/suite.rb
test/suite.rb:26: warning: variable $KCODE is no longer effective; ignored
/home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored
/home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:28: warning: variable $KCODE is no longer effective; ignored
/home/masa/ywesee/rpdf2txt/test/mock.rb:19:in `require': no such file to load -- runit/error (LoadError)
from /home/masa/ywesee/rpdf2txt/test/mock.rb:19:in `<top (required)>'
from /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:32:in `require'
from /home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:32:in `<top (required)>'
from test/suite.rb:29:in `require'
from test/suite.rb:29:in `block in <main>'
from test/suite.rb:28:in `foreach'
from test/suite.rb:28:in `<main>'
Replace 'runit/error' with 'test/unit'
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 test/suite.rb
test/suite.rb:26: warning: variable $KCODE is no longer effective; ignored
/home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored
/home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:28: warning: variable $KCODE is no longer effective; ignored
test/suite.rb:29:in `require': /home/masa/ywesee/rpdf2txt/test/test_pdf_text.rb:460: invalid multibyte char (US-ASCII) (SyntaxError)
/home/masa/ywesee/rpdf2txt/test/test_pdf_text.rb:372: syntax error, unexpected $end, expecting tSTRING_CONTENT or tSTRING_DBEG or tSTRING_DVAR or tSTRING_END
from test/suite.rb:29:in `block in <main>'
from test/suite.rb:28:in `foreach'
from test/suite.rb:28:in `<main>'
Add magic comment ('# encoding: ascii-8bit')
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 test/suite.rb
...
81) Error:
test_txt(TestTextState):
Encoding::CompatibilityError: incompatible encoding regexp match (UTF-8 regexp with ASCII-8BIT string)
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text_state.rb:309:in `[]'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text_state.rb:309:in `set_txt'
/home/masa/ywesee/rpdf2txt/test/test_text_state.rb:312:in `test_txt'
Finished in 4.273717702 seconds.
134 tests, 145 assertions, 8 failures, 73 errors, 0 pendings, 0 omissions, 0 notifications
39.5522% passed
Note
1) Error:
test_cmap_bfchar(Rpdf2txt::TestCmap):
ArgumentError: cannot make an element from nil
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:82:in `make_element'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `block in make_elements'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `map'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `make_elements'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:361:in `initialize'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:484:in `new'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:484:in `prod'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:134:in `block in eval_ast'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `map'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:132:in `eval_ast'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:129:in `block in eval_ast'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `map'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:129:in `eval_ast'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:115:in `rockit_productions_eval'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:180:in `rockit_grammar_eval'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:47:in `block in generate_parser_from_file_to_file'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/base_extensions.rb:65:in `call'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/base_extensions.rb:65:in `time_and_puts'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:46:in `generate_parser_from_file_to_file'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/cmapparser.rb:44:in `cmap_parser'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:1062:in `parse_cmap'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:1000:in `initialize'
/home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:102:in `new'
/home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:102:in `test_cmap_bfchar'
Add magic comment ('# encoding: ascii-8bit')
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started . Finished in 0.007427187 seconds. 1 tests, 1 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Next
test_parser_grammar_bfrange(Rpdf2txt::TestCmap):
ArgumentError: cannot make an element from nil
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:82:in `make_element'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `block in make_elements'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `map'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:92:in `make_elements'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:361:in `initialize'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:484:in `new'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/grammar.rb:484:in `prod'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:134:in `block in eval_ast'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `map'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:132:in `eval_ast'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:129:in `block in eval_ast'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `map'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:129:in `eval_ast'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:115:in `rockit_productions_eval'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb:180:in `rockit_grammar_eval'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:47:in `block in generate_parser_from_file_to_file'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/base_extensions.rb:65:in `call'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/base_extensions.rb:65:in `time_and_puts'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/rockit.rb:46:in `generate_parser_from_file_to_file'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/cmapparser.rb:58:in `cmap_range_parser'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:1066:in `parse_cmap'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:1000:in `initialize'
test/test_pdf_object.rb:108:in `new'
test/test_pdf_object.rb:108:in `test_parser_grammar_bfrange'
Finished in 0.021427573 seconds.
Add magic comment ('# encoding: ascii-8bit')
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started .... Finished in 0.013319168 seconds. 4 tests, 6 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Next
test_decrypt(Rpdf2txt::TestEncrypt):
ArgumentError: invalid byte sequence in UTF-8
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:292:in `check'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:292:in `block in peek'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:291:in `each'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/token.rb:291:in `peek'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/glr_parser.rb:133:in `actor'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/glr_parser.rb:70:in `parse'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:79:in `_parse_attributes'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:86:in `parse_attributes'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:44:in `initialize'
test/test_pdf_object.rb:536:in `new'
test/test_pdf_object.rb:536:in `setup'
Replace 'File.read' with 'open' in 'rb' (binary) mode
class TestEncrypt < Test::Unit::TestCase
# Loads the fixture data/encrypt_string (relative to this file) in binary
# mode, builds the PdfEncrypt under test from it and assigns its file id.
def setup
  path = File.expand_path('./data/encrypt_string', File.dirname(__FILE__))
  #src_encrypt_obj = File.read(file)
  src_encrypt_obj = open(path, 'rb') { |io| io.read }
  @encrypt = Rpdf2txt::PdfEncrypt.new(src_encrypt_obj)
  @encrypt.file_id = '8664e6986751f2a49dccc9a4b40a4f18'
end
# Checks the key derived for the fixture data/working_obj and that the
# decrypted stream can be inflated without raising.
def test_decrypt
  path = File.expand_path('./data/working_obj', File.dirname(__FILE__))
  #input = File.read(file)
  input = open(path, 'rb') { |io| io.read }
  pdf_obj = Rpdf2txt::Stream.new(input)
  key_hex = @encrypt.decrypt_key(pdf_obj).unpack('h*').first
  assert_equal("dc08b36009e48618f99c", key_hex)
  # If the stream can be inflated, the decryption is ok.
  # NOTE(review): the two identical calls are kept as found; confirm whether
  # the repetition is intentional or a duplicate left by the page rendering.
  assert_nothing_raised do
    Zlib::Inflate.inflate(@encrypt.decrypt(pdf_obj))
    Zlib::Inflate.inflate(@encrypt.decrypt(pdf_obj))
  end
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started .......................unknown encoding 370 0 R .. Finished in 0.448861627 seconds. 25 tests, 52 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
All 'File.read' calls have been replaced
Next
test_width(Rpdf2txt::TestFont) [test/test_pdf_object.rb:753]: <278> expected but was <nil>
# Returns the width for +char+: the result of _width, or of named_width when
# _width returns nil or false.
#
# A one-character String is first reduced to its first byte value. Ruby 1.8
# returned that Integer from char[0], whereas Ruby 1.9 returns a one-character
# String, so the byte is read with unpack('C'), which yields the same
# Integer on both. Any other argument is passed through unchanged.
def width(char)
  single_char = char.is_a?(String) && char.length == 1
  char = char.unpack('C').first if single_char
  _width(char) || named_width(char)
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started ...................................unknown encoding 370 0 R ... Finished in 0.665297144 seconds. 38 tests, 69 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Next
test_text__fixed_double_lead_bug(Rpdf2txt::TestPageLeaf):
Encoding::CompatibilityError: incompatible encoding regexp match (UTF-8 regexp with ASCII-8BIT string)
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text_state.rb:309:in `[]'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text_state.rb:309:in `set_txt'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:173:in `_snip'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:170:in `snip'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:144:in `block (2 levels) in scan_tree'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:94:in `each'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:94:in `block in scan_tree'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `each'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:82:in `scan_tree'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:142:in `block (2 levels) in scan_tree'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:94:in `each'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:94:in `block in scan_tree'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `each'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:337:in `rescue in method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt-rockit/syntax_tree.rb:333:in `method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:82:in `scan_tree'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:75:in `scan'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:776:in `extract_text_objects'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:551:in `text'
test/test_pdf_object.rb:1223:in `test_text__fixed_double_lead_bug'
lib/rpdf2txt/text_state.rb#set_txt
# Stores +txt+ and computes the widths used for layout.
#
# After the call:
#   @boxwidth - summed char_width of every byte of txt without its
#               trailing whitespace
#   @w        - @boxwidth plus the char_width of every byte of the
#               trailing whitespace
#   @txt      - the result of recode_txt(txt)
#
# txt - the raw text; unescape_txt! is applied to it before measuring.
def set_txt(txt)
  # Unescape first so that an escape such as \334 is replaced by the
  # single character it stands for (U with diaeresis); otherwise the
  # calculation of the string width is wrong.
  unescape_txt!(txt)
  @boxwidth = 0
  txt.rstrip.each_byte do |byte|
    @boxwidth += char_width(byte)
  end
  @w = @boxwidth
  # The /n flag (instead of the former /u) lets the regexp match
  # ASCII-8BIT strings under Ruby 1.9 without raising
  # Encoding::CompatibilityError.
  # \z (not $) anchors at the very end of the string: $ also matches
  # before an embedded newline, which would count inner whitespace
  # twice (once in @boxwidth and once more here).
  if white = txt[/\s+\z/n]
    white.each_byte do |byte|
      @w += char_width(byte)
    end
  end
  @txt = recode_txt(txt)
end
Note
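IMPORTANT: in Ruby 1.9, a regexp cannot be matched against a string whose encoding is incompatible with it (here: a UTF-8 regexp, /u, against an ASCII-8BIT string). Using the /n flag instead of /u avoids the Encoding::CompatibilityError.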
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started ....................'invalid literal/lengths set' when filtering with /FlateDecode ...................unknown encoding 370 0 R ... Finished in 0.827054215 seconds. 42 tests, 75 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Check all the tests (test_pdf_object.rb)
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib test/test_pdf_object.rb Loaded suite test/test_pdf_object Started ......................'invalid literal/lengths set' when filtering with /FlateDecode .............................unknown encoding 370 0 R ... Finished in 5.157712 seconds. 54 tests, 95 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_object.rb test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored Loaded suite test/test_pdf_object Started ......................'invalid literal/lengths set' when filtering with /FlateDecode .............................unknown encoding 370 0 R ... Finished in 4.268063275 seconds. 54 tests, 95 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Check
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_space_bug_05_2004.rb Loaded suite test/test_space_bug_05_2004 Started . Finished in 0.233933132 seconds. 1 tests, 1 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Check
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_stream.rb Loaded suite test/test_stream Started .......... Finished in 0.691126396 seconds. 10 tests, 14 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Check
Next
test_extract_TD(Rpdf2txt::TestText):
NameError: uninitialized constant Mock::RUNIT
/home/masa/ywesee/rpdf2txt/test/mock.rb:108:in `__mock_call'
/home/masa/ywesee/rpdf2txt/test/mock.rb:73:in `method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:178:in `text_state='
test/test_pdf_text.rb:100:in `test_extract_TD'
Experiment
test.rb
# Returns the arity that Ruby reports for the block given to this call.
def foo(&blk)
  blk.arity
end

# Print the reported arity for a series of block parameter lists.
p(foo {})                   # no parameter list at all
p(foo { || })               # explicitly empty parameter list
p(foo { |x| })              # one required parameter
p(foo { |*x| })             # splat only
p(foo { |x, y| })           # two required parameters
p(foo { |x, *y| })          # one required parameter plus splat
p(foo { |(x, y)| })         # one destructured parameter
p(foo { |(x, y), z| })      # destructured parameter plus a plain one
Result
masa@masa ~/work $ ruby18 test.rb
-1
0
1
-1
2
-2
2
2
masa@masa ~/work $ ruby1.9 test.rb
0
0
1
-1
2
-2
1
2
Note
Experiment
def test_extract_Tc
#PDF doc: Tc always has 1 operand
@text.src = <<-EOS
BT
-0.0002 Tc
ET
EOS
ast = Rpdf2txt.text_parser.parse(@text.src)
assert_equal("-0.0002", ast.values.first.charspace.value)
text_state = Mock.new("text_state")
text_state.__next(:transformation_matrix=) {}
text_state.__next(:transformation_matrix=) {|*x|}
@text.text_state = text_state
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_text.rb
Loaded suite test/test_pdf_text
Started
E
1) Error:
test_extract_Tc(Rpdf2txt::TestText):
NoMethodError: undefined method `to_i' for :transformation_matrix=:Symbol
/home/masa/ywesee/rpdf2txt/test/mock.rb:171:in `__pre'
/home/masa/ywesee/rpdf2txt/test/mock.rb:147:in `__mock_call'
/home/masa/ywesee/rpdf2txt/test/mock.rb:88:in `method_missing'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/text.rb:178:in `text_state='
test/test_pdf_text.rb:86:in `test_extract_Tc'
Finished in 0.005110468 seconds.
1 tests, 1 assertions, 0 failures, 1 errors, 0 pendings, 0 omissions, 0 notifications
0% passed
Note
Experiment
# Builds the symbol under which the "pre" hook for +method+ is stored.
#
# method - the mocked method's name (a Symbol such as
#          :transformation_matrix=).
#
# Returns a Symbol of the form :"__pre_<method>".
def Mock.__pre(method)
  # The earlier version interpolated method.to_i; Symbol does not
  # respond to to_i under Ruby 1.9, so the name itself is interpolated.
  #"__pre_#{method.to_i}".intern
  :"__pre_#{method}"
end
Result
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_text.rb Loaded suite test/test_pdf_text Started . Finished in 0.005063328 seconds. 1 tests, 1 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Test all the cases (test_pdf_text)
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib test/test_pdf_text.rb Loaded suite test/test_pdf_text Started ............................. Finished in 1.048242 seconds. 29 tests, 37 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/test_pdf_text.rb Loaded suite test/test_pdf_text Started ............................. Finished in 0.645440965 seconds. 29 tests, 37 assertions, 0 failures, 0 errors, 0 pendings, 0 omissions, 0 notifications 100% passed
Note
Final check
masa@masa ~/ywesee/rpdf2txt $ ruby18 test/suite.rb
Loaded suite test/suite
Started
......................'invalid literal/lengths set' when filtering with /FlateDecode
...................................................................unknown encoding 370 0 R
.........#<Rpdf2txt::CMap:0x7f51de715f48 @target_encoding="utf8", @decoded_stream="", @decrypted_stream="", @src="<< >>", @raw_stream="", @map={}, @attributes={}>
.......................
Finished in 9.191181 seconds.
121 tests, 277 assertions, 0 failures, 0 errors
masa@masa ~/ywesee/rpdf2txt $ ruby18 -I lib test/suite.rb
Loaded suite test/suite
Started
......................'invalid literal/lengths set' when filtering with /FlateDecode
...................................................................unknown encoding 370 0 R
.........E......................
Finished in 9.105659 seconds.
1) Error:
test_join_snippets__hex_chars(TestParser):
NoMethodError: undefined method `[]' for nil:NilClass
./lib/rpdf2txt/object.rb:787:in `raw_stream'
./lib/rpdf2txt/object.rb:790:in `decode_raw_stream'
./lib/rpdf2txt/object.rb:682:in `decoded_stream'
./lib/rpdf2txt/object.rb:1050:in `extract_bfchar'
./lib/rpdf2txt/object.rb:1069:in `parse_cmap'
./lib/rpdf2txt/object.rb:1008:in `initialize'
/home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:313:in `new'
/home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:313:in `test_join_snippets__hex_chars'
121 tests, 277 assertions, 0 failures, 1 errors
masa@masa ~/ywesee/rpdf2txt $ ruby1.9 -I lib test/suite.rb
test/suite.rb:26: warning: variable $KCODE is no longer effective; ignored
/home/masa/ywesee/rpdf2txt/test/test_pdf_object.rb:26: warning: variable $KCODE is no longer effective; ignored
/home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:28: warning: variable $KCODE is no longer effective; ignored
Loaded suite test/suite
Started
......................'invalid literal/lengths set' when filtering with /FlateDecode
.........................................................
..........unknown encoding 370 0 R
.........E........F.............
1) Error:
test_join_snippets__hex_chars(TestParser):
NoMethodError: undefined method `each' for nil:NilClass
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:107:in `extract_attributes'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:88:in `parse_attributes'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:44:in `initialize'
/home/masa/ywesee/rpdf2txt/lib/rpdf2txt/object.rb:1007:in `initialize'
/home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:313:in `new'
/home/masa/ywesee/rpdf2txt/test/test_pdf_parser.rb:313:in `test_join_snippets__hex_chars'
2) Failure:
test_char_width(TestTextState) [/home/masa/ywesee/rpdf2txt/test/test_text_state.rb:303]:
<0.313> expected but was
<0.301>
diff:
? 0.3013
Finished in 7.296597714 seconds.
121 tests, 277 assertions, 1 failures, 1 errors, 0 pendings, 0 omissions, 0 notifications
98.3471% passed
Note