diff --git a/lib/rpdf2txt-rockit/rockit_grammars_parser.rb b/lib/rpdf2txt-rockit/rockit_grammars_parser.rb index 364e28b..13806b1 100644 --- a/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +++ b/lib/rpdf2txt-rockit/rockit_grammars_parser.rb @@ -6,7 +6,6 @@ module Parse # and licensed under GPL # but this parser is under LGPL tokens = [ - t1 = EofToken.new("EOF",/^(才~中~^^~5348086680)/n), t2 = Token.new("Blank",/^(\s+)/n,:Skip), t3 = Token.new("Comment",/^(#.*$)/n,:Skip), t4 = Token.new("Regexp",/^(\/((\\\/)|[^\/])*\/[iomnux]*)/n), diff --git a/lib/rpdf2txt-rockit/token.rb b/lib/rpdf2txt-rockit/token.rb index bebdfab..2d4e848 100644 --- a/lib/rpdf2txt-rockit/token.rb +++ b/lib/rpdf2txt-rockit/token.rb @@ -135,7 +135,6 @@ class EofToken < Token # Shouldn't match anything but since I'm not sure how to do a regexp # with that chareacteristic we use a highly unlikely string in the mean # time. - super("EOF", "才~中~^^~" + rand(1e10).inspect) end def ==(other) @@ -148,7 +147,6 @@ class EpsilonToken < Token # Shouldn't match anything but since I'm not sure how to do a regexp # with that chareacteristic we use a highly unlikely string in the mean # time. - super("epsilon", "才~中~^^~" + rand(1e10).inspect) end def ==(other) diff --git a/lib/rpdf2txt/data/pdfattributes.rb b/lib/rpdf2txt/data/pdfattributes.rb index 4ec9770..a6663eb 100644 --- a/lib/rpdf2txt/data/pdfattributes.rb +++ b/lib/rpdf2txt/data/pdfattributes.rb @@ -1,12 +1,13 @@ +# encoding: ascii-8bit require 'rpdf2txt-rockit/rockit' module Rpdf2txt # Parser for PdfAttributes - # created by Rockit version 0.3.8 on Tue Aug 11 14:26:52 +0200 2009 + # created by Rockit version 0.3.8 on Mon Jan 17 11:23:17 +0100 2011 # Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se # and licensed under GPL # but this parser is under LGPL tokens = [ - t1 = EofToken.new("EOF",/^(才~中~^^~3052319184)/), + t1 = EofToken.new("EOF",/^(才~中~^^~1813729497)/), t2 = Token.new("IDENTIFIER",/^\/[\w:+\#\-\.]*(, ?[\w:+\#\-\.]*)*/n), t3 = Token.new("NUMERIC",/^(-?[0-9]+([.,][0-9]+)?)/n), t4 = Token.new("REFERENCE",/^([0-9]+\s+[0-9]+\s+R)/n), @@ -36,9 +37,9 @@ module Rpdf2txt p10 = Production.new(:Expr,[t8],LiftingSyntaxTreeBuilder.new(["val"],[])), p11 = Production.new(:Array,[t9, :ArrayElements, t10],SyntaxTreeBuilder.new("Array",["_", "values", "_"],[])), p12 = Production.new(:Array,[t9, t10],SyntaxTreeBuilder.new("Array",["_", "_"],[])), - p13 = Production.new(:ArrayElements,[:Plus70261701788920],LiftingSyntaxTreeBuilder.new(["values"],[])), - p14 = Production.new(:Plus70261701788920,[:Plus70261701788920, :ArrayElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)), - p15 = Production.new(:Plus70261701788920,[:ArrayElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)), + p13 = Production.new(:ArrayElements,[:Plus70302297656140],LiftingSyntaxTreeBuilder.new(["values"],[])), + p14 = Production.new(:Plus70302297656140,[:Plus70302297656140, :ArrayElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)), + p15 = Production.new(:Plus70302297656140,[:ArrayElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)), p16 = Production.new(:ArrayElement,[:Array],LiftingSyntaxTreeBuilder.new(["_"],[])), p17 = Production.new(:ArrayElement,[:Hash],LiftingSyntaxTreeBuilder.new(["_"],[])), p18 = Production.new(:ArrayElement,[t3],LiftingSyntaxTreeBuilder.new(["_"],[])), @@ -47,10 +48,10 @@ module Rpdf2txt p21 = Production.new(:ArrayElement,[t5],LiftingSyntaxTreeBuilder.new(["_"],[])), p22 = Production.new(:ArrayElement,[t8],LiftingSyntaxTreeBuilder.new(["_"],[])), p23 = Production.new(:ArrayElement,[:Text],LiftingSyntaxTreeBuilder.new(["_"],[])), - p24 = Production.new(:Hash,[t11, :Mult70261701775380, t12],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])), + p24 = Production.new(:Hash,[t11, :Mult70302297637820, t12],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])), p25 = Production.new(:Hash,[t11, t12],ArrayNodeBuilder.new([],nil,SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[]),1,[],true)), - p26 = Production.new(:Mult70261701775380,[:Mult70261701775380, t2, :Expr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)), - p27 = Production.new(:Mult70261701775380,[t2, :Expr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)), + p26 = Production.new(:Mult70302297637820,[:Mult70302297637820, t2, :Expr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)), + p27 = Production.new(:Mult70302297637820,[t2, :Expr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)), p28 = Production.new(:Date,[t13, t14, t15],SyntaxTreeBuilder.new("Date",["c1", "regexptoken1015925646", "c3"],[])), p29 = Production.new(:Text,[t16],SyntaxTreeBuilder.new("Text",["text"],[])) ] @@ -58,14 +59,14 @@ module Rpdf2txt ] priorities = ProductionPriorities.new(relations) - action_table = [[5, 16, 9, 32768, 13, 1024, 17, 256, 25, 4, 41, 8, 45, 2, 49, 4096, 57, 128], [32, 2051], [112, 65439], [61, 2048, 65, 2], [73, 16, 9, 32768, 13, 1024, 17, 256, 85, 4, 97, 8, 101, 2, 105, 512, 113, 128], [24, 2051], [20, 2051], [12, 2051], [8, 2051], [2, 1], [16, 2051], [4, 2051], [121, 8192], [28, 2051], [36, 2051], [96, 65439], [5, 16, 9, 32768, 13, 1024, 17, 256, 25, 4, 41, 8, 45, 2, 49, 4096, 57, 128], [129, 2048, 133, 2], [80, 65438], [73, 16, 9, 32768, 13, 1024, 17, 256, 85, 4, 97, 8, 101, 2, 113, 128, 48, 512], [88, 65438], [68, 65438], [64, 65438], [60, 65438], [76, 65438], [72, 65438], [44, 65439], [56, 65438], [84, 65438], [141, 512], [145, 16384], [104, 2050], [92, 65439], [5, 16, 9, 32768, 13, 1024, 17, 256, 25, 4, 41, 8, 45, 2, 49, 4096, 57, 128], [52, 65438], [40, 65439], [108, 2051], [100, 2050]] - goto_hash = {16 => {6 => 8, 1 => 31, 2 => 7, 8 => 13, 9 => 5}, 33 => {6 => 8, 1 => 37, 2 => 7, 8 => 13, 9 => 5}, 0 => {6 => 8, 1 => 9, 2 => 7, 8 => 13, 9 => 5}, 19 => {5 => 34, 6 => 22, 2 => 23, 9 => 20}, 3 => {7 => 17}, 4 => {5 => 27, 6 => 22, 2 => 23, 3 => 29, 9 => 20, 4 => 19}} - @@parse_table70261701563520 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[ + action_table = [[5, 128, 25, 32768, 29, 2, 33, 1024, 37, 8, 41, 4096, 49, 4, 53, 16, 57, 256], [36, 2051], [24, 2051], [12, 2051], [8, 2051], [2, 1], [112, 65439], [4, 2051], [65, 2, 69, 2048], [16, 2051], [73, 8192], [28, 2051], [20, 2051], [32, 2051], [77, 128, 85, 512, 25, 32768, 101, 2, 33, 1024, 105, 8, 113, 4, 117, 16, 57, 256], [125, 2, 129, 2048], [5, 128, 25, 32768, 29, 2, 33, 1024, 37, 8, 41, 4096, 49, 4, 53, 16, 57, 256], [96, 65439], [137, 16384], [84, 65438], [88, 65438], [44, 65439], [64, 65438], [60, 65438], [77, 128, 25, 32768, 101, 2, 33, 1024, 105, 8, 113, 4, 117, 16, 57, 256, 48, 512], [72, 65438], [76, 65438], [56, 65438], [68, 65438], [80, 65438], [145, 512], [5, 128, 25, 32768, 29, 2, 33, 1024, 37, 8, 41, 4096, 49, 4, 53, 16, 57, 256], [92, 65439], [104, 2050], [108, 2051], [52, 65438], [40, 65439], [100, 2050]] + goto_hash = {16 => {6 => 4, 1 => 33, 2 => 3, 8 => 11, 9 => 2}, 0 => {6 => 4, 1 => 5, 2 => 3, 8 => 11, 9 => 2}, 24 => {5 => 35, 6 => 22, 2 => 23, 9 => 20}, 8 => {7 => 15}, 14 => {5 => 27, 6 => 22, 2 => 23, 3 => 30, 9 => 20, 4 => 24}, 31 => {6 => 4, 1 => 37, 2 => 3, 8 => 11, 9 => 2}} + @@parse_table70302297779480 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[ :REDUCE, :SHIFT, :ACCEPT ]) def Rpdf2txt._attr_parser - GeneralizedLrParser.new(@@parse_table70261701563520) + GeneralizedLrParser.new(@@parse_table70302297779480) end end diff --git a/lib/rpdf2txt/data/pdftext.rb b/lib/rpdf2txt/data/pdftext.rb index 31122a1..10263bb 100644 --- a/lib/rpdf2txt/data/pdftext.rb +++ b/lib/rpdf2txt/data/pdftext.rb @@ -1,12 +1,12 @@ require 'rpdf2txt-rockit/rockit' module Rpdf2txt # Parser for PdfText - # created by Rockit version 0.3.8 on Thu Oct 01 11:19:33 +0200 2009 + # created by Rockit version 0.3.8 on Mon Jan 17 11:23:17 +0100 2011 # Rockit is copyright (c) 2001 Robert Feldt, feldt@ce.chalmers.se # and licensed under GPL # but this parser is under LGPL tokens = [ - t1 = EofToken.new("EOF",/^(才~中~^^~5511964093)/), + t1 = EofToken.new("EOF",/^(才~中~^^~1714392160)/), t2 = Token.new("NUMERIC",/^(-?(([0-9]*[.,_][0-9]+)|([0-9]+)))/n), t3 = Token.new("SPACE",/^(\s+)/n,:Skip), t4 = Token.new("HEXSNIPPET",/^([0-9A-F]+)/in), @@ -56,9 +56,9 @@ module Rpdf2txt productions = [ p1 = Production.new("Target'".intern,[:Target],SyntaxTreeBuilder.new("Target'",["target"],[])), p2 = Production.new(:Target,[t10, :Exprs, t11],SyntaxTreeBuilder.new("Target",["_", "values", "_"],[])), - p3 = Production.new(:Exprs,[:Plus70032186462260],LiftingSyntaxTreeBuilder.new(["values"],[])), - p4 = Production.new(:Plus70032186462260,[:Plus70032186462260, :Expr],ArrayNodeBuilder.new([1],0,nil,nil,[],true)), - p5 = Production.new(:Plus70032186462260,[:Expr],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)), + p3 = Production.new(:Exprs,[:Plus70302290993460],LiftingSyntaxTreeBuilder.new(["values"],[])), + p4 = Production.new(:Plus70302290993460,[:Plus70302290993460, :Expr],ArrayNodeBuilder.new([1],0,nil,nil,[],true)), + p5 = Production.new(:Plus70302290993460,[:Expr],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)), p6 = Production.new(:Expr,[:TmElement],LiftingSyntaxTreeBuilder.new(["val"],[])), p7 = Production.new(:Expr,[:Array],LiftingSyntaxTreeBuilder.new(["val"],[])), p8 = Production.new(:Expr,[:TDElement],LiftingSyntaxTreeBuilder.new(["val"],[])), @@ -90,9 +90,9 @@ module Rpdf2txt p34 = Production.new(:HexElement,[t23, t4, t24],LiftingSyntaxTreeBuilder.new(["_", "hex", "_"],[])), p35 = Production.new(:HexElement,[t23, t24],LiftingSyntaxTreeBuilder.new(["_", "_"],[])), p36 = Production.new(:TjHex,[:HexElement, t22],SyntaxTreeBuilder.new("Tjhex",["hexsnippet", "_"],[])), - p37 = Production.new(:TJArrayElements,[:Plus70032186325260],LiftingSyntaxTreeBuilder.new(["values"],[])), - p38 = Production.new(:Plus70032186325260,[:Plus70032186325260, :TJSingleElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)), - p39 = Production.new(:Plus70032186325260,[:TJSingleElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)), + p37 = Production.new(:TJArrayElements,[:Plus70302290848760],LiftingSyntaxTreeBuilder.new(["values"],[])), + p38 = Production.new(:Plus70302290848760,[:Plus70302290848760, :TJSingleElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)), + p39 = Production.new(:Plus70302290848760,[:TJSingleElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)), p40 = Production.new(:TJSingleElement,[t7],SyntaxTreeBuilder.new("TJSingleElement",["snippet"],[])), p41 = Production.new(:TJSingleElement,[t2],SyntaxTreeBuilder.new("TJSingleElement",["kerning"],[])), p42 = Production.new(:TJSingleElement,[:HexElement],SyntaxTreeBuilder.new("TJSingleElement",["hexsnippet"],[])), @@ -107,10 +107,10 @@ module Rpdf2txt p51 = Production.new(:LineWidth,[t2, t33],SyntaxTreeBuilder.new("Width",["width", "_"],[])), p52 = Production.new(:BTElement,[t34],SyntaxTreeBuilder.new("BT",["_"],[])), p53 = Production.new(:ETElement,[t35],SyntaxTreeBuilder.new("ET",["_"],[])), - p54 = Production.new(:Hash,[t36, :Mult70032186252840, t37],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])), + p54 = Production.new(:Hash,[t36, :Mult70302290775040, t37],SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[])), p55 = Production.new(:Hash,[t36, t37],ArrayNodeBuilder.new([],nil,SyntaxTreeBuilder.new("Hash",["_", "pairs", "_"],[]),1,[],true)), - p56 = Production.new(:Mult70032186252840,[:Mult70032186252840, t8, :HashExpr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)), - p57 = Production.new(:Mult70032186252840,[t8, :HashExpr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)), + p56 = Production.new(:Mult70302290775040,[:Mult70302290775040, t8, :HashExpr],ArrayNodeBuilder.new([1, 2],0,nil,nil,[],true)), + p57 = Production.new(:Mult70302290775040,[t8, :HashExpr],ArrayNodeBuilder.new([0, 1],nil,nil,nil,[],true)), p58 = Production.new(:HashExpr,[t8],LiftingSyntaxTreeBuilder.new(["val"],[])), p59 = Production.new(:HashExpr,[t2],LiftingSyntaxTreeBuilder.new(["val"],[])), p60 = Production.new(:HashExpr,[t9],LiftingSyntaxTreeBuilder.new(["val"],[])), @@ -122,9 +122,9 @@ module Rpdf2txt p66 = Production.new(:UElement,[t2, t43],SyntaxTreeBuilder.new("UElement",["c1", "regexptoken-265279295"],[])), p67 = Production.new(:UElement,[t2, t2, t2, t44],SyntaxTreeBuilder.new("UElement",["c1", "numeric2", "numeric3", "c4"],[])), p68 = Production.new(:UElement,[t45, t2, t46],SyntaxTreeBuilder.new("UElement",["c1", "numeric", "c3"],[])), - p69 = Production.new(:CNElements,[:Plus70032186212980],SyntaxTreeBuilder.new("CNElements",["plus"],[])), - p70 = Production.new(:Plus70032186212980,[:Plus70032186212980, :CNElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)), - p71 = Production.new(:Plus70032186212980,[:CNElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)), + p69 = Production.new(:CNElements,[:Plus70302290731320],SyntaxTreeBuilder.new("CNElements",["plus"],[])), + p70 = Production.new(:Plus70302290731320,[:Plus70302290731320, :CNElement],ArrayNodeBuilder.new([1],0,nil,nil,[],true)), + p71 = Production.new(:Plus70302290731320,[:CNElement],ArrayNodeBuilder.new([0],nil,nil,nil,[],true)), p72 = Production.new(:CNElement,[t5],SyntaxTreeBuilder.new("CNElement",["c1"],[])), p73 = Production.new(:CNElement,[t19, t5, t5],SyntaxTreeBuilder.new("CNElement",["c1", "alphanumeric", "maybe"],[])), p74 = Production.new(:CNElement,[t19, t5],SyntaxTreeBuilder.new("CNElement",["c1", "alphanumeric", "maybe"],[2])) @@ -133,14 +133,14 @@ module Rpdf2txt ] priorities = ProductionPriorities.new(relations) - action_table = [[5, 512], [13, 16, 57, 2, 69, 4194304, 73, 262144, 85, 64, 89, 2048, 125, 1099511627776, 129, 67108864, 141, 17592186044416], [2, 1], [284, 2199023517712], [88, 70342974373338], [80, 70342974373338], [72, 70342974373338], [60, 70342974373338], [153, 2097152], [84, 70342974373338], [52, 70342974373338], [28, 70342974373338], [157, 2199023255552], [36, 70342974373338], [161, 4294967296, 165, 33554432, 169, 65536, 173, 536870912, 177, 4398046511104, 181, 2, 185, 134217728, 189, 1048576, 193, 131072], [24, 70342974373338], [16, 70342974373338], [197, 8, 201, 8388608], [205, 16], [32, 70342974373338], [13, 16, 209, 262144, 272, 2199023255552], [217, 2097152, 221, 1073741824], [233, 2, 69, 4194304, 237, 16777216, 241, 64], [92, 70342974373338], [76, 70342974373338], [64, 70342974373338], [40, 70342974373338], [20, 70342974373338], [68, 70342974373338], [13, 16, 57, 2, 69, 4194304, 73, 262144, 85, 64, 89, 2048, 125, 1099511627776, 129, 67108864, 141, 17592186044416, 8, 1024], [257, 1024], [252, 70342974373338], [176, 70342974373338], [56, 70342974373338], [280, 2199023517712], [261, 2], [48, 70342974373338], [44, 70342974373338], [140, 70342974373338], [256, 70342974373338], [200, 70342974373338], [172, 70342974373338], [112, 70342974373338], [188, 70342974373338], [260, 70342974373338], [265, 2, 269, 64, 273, 16384, 277, 32768], [180, 70342974373338], [124, 70342974373338], [116, 70342974373338], [281, 8388608], [136, 31461450], [285, 16, 289, 34359738368, 297, 2, 301, 274877906944, 292, 2199023517712], [305, 16], [276, 2199023517712], [128, 70342974373338], [192, 70342974373338], [164, 29364298], [233, 2, 69, 4194304, 237, 16777216, 241, 64, 144, 4096], [160, 29364298], [168, 29364298], [156, 29364298], [152, 29364298], [313, 4096], [12, 70342974373338], [4, 1], [317, 35184372088832], [321, 8796093022208, 325, 268435456, 329, 2], [333, 2147483648], [104, 70342974373338], [108, 70342974373338], [132, 31461450], [288, 2199023517712], [341, 68719476736, 345, 128], [349, 549755813888], [353, 524288], [244, 70342974373338], [285, 16, 292, 2199023517712], [148, 29364298], [96, 70342974373338], [268, 70342974373338], [264, 70342974373338], [184, 70342974373338], [357, 137438953472, 361, 2], [196, 70342974373338], [365, 68719476736, 369, 128], [216, 549755813888], [373, 2, 381, 128, 385, 256], [248, 70342974373338], [120, 70342974373338], [240, 70342974373338], [389, 2], [212, 549755813888], [373, 2, 381, 128, 385, 256], [232, 68719476864], [224, 68719476864], [228, 68719476864], [236, 68719476864], [397, 8192], [220, 68719476864], [100, 70342974373338]] - goto_hash = {22 => {16 => 62, 17 => 57, 18 => 61, 14 => 56}, 0 => {1 => 2}, 72 => {30 => 84}, 1 => {5 => 15, 11 => 37, 22 => 24, 33 => 12, 6 => 27, 12 => 28, 34 => 20, 23 => 10, 7 => 11, 13 => 36, 35 => 34, 2 => 30, 24 => 5, 19 => 25, 8 => 19, 3 => 29, 25 => 9, 14 => 8, 9 => 13, 20 => 7, 15 => 33, 4 => 16, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 51 => {29 => 73}, 29 => {5 => 15, 11 => 37, 22 => 24, 33 => 12, 6 => 27, 12 => 28, 34 => 20, 23 => 10, 7 => 11, 13 => 36, 35 => 34, 24 => 5, 19 => 25, 8 => 19, 25 => 9, 14 => 8, 9 => 13, 20 => 7, 15 => 33, 4 => 63, 26 => 4, 10 => 26, 32 => 23, 21 => 6}, 57 => {18 => 77, 14 => 56}, 86 => {31 => 94}, 20 => {35 => 53}, 92 => {31 => 98}} - @@parse_table70032185719080 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[ + action_table = [[5, 512], [13, 17592186044416, 53, 2, 69, 64, 81, 67108864, 85, 1099511627776, 93, 4194304, 97, 262144, 133, 16, 141, 2048], [2, 1], [153, 2], [88, 70342974373338], [80, 70342974373338], [72, 70342974373338], [60, 70342974373338], [157, 2097152], [84, 70342974373338], [52, 70342974373338], [28, 70342974373338], [161, 2199023255552], [165, 536870912, 169, 134217728, 173, 1048576, 177, 2, 181, 4294967296, 185, 131072, 189, 4398046511104, 193, 65536, 197, 33554432], [36, 70342974373338], [13, 17592186044416, 53, 2, 69, 64, 81, 67108864, 85, 1099511627776, 93, 4194304, 97, 262144, 133, 16, 141, 2048, 8, 1024], [205, 262144, 133, 16, 272, 2199023255552], [213, 2097152, 217, 1073741824], [24, 70342974373338], [16, 70342974373338], [176, 70342974373338], [252, 70342974373338], [32, 70342974373338], [221, 8388608, 225, 8], [229, 16], [92, 70342974373338], [76, 70342974373338], [64, 70342974373338], [40, 70342974373338], [20, 70342974373338], [68, 70342974373338], [233, 1024], [56, 70342974373338], [284, 2199023517712], [280, 2199023517712], [241, 2, 245, 64, 93, 4194304, 249, 16777216], [48, 70342974373338], [44, 70342974373338], [265, 35184372088832], [140, 70342974373338], [256, 70342974373338], [188, 70342974373338], [180, 70342974373338], [124, 70342974373338], [269, 16384, 273, 2, 277, 64, 281, 32768], [200, 70342974373338], [116, 70342974373338], [260, 70342974373338], [112, 70342974373338], [172, 70342974373338], [12, 70342974373338], [285, 16], [276, 2199023517712], [128, 70342974373338], [192, 70342974373338], [136, 31461450], [289, 8388608], [293, 2, 301, 274877906944, 305, 34359738368, 309, 16, 292, 2199023517712], [4, 1], [164, 29364298], [160, 29364298], [156, 29364298], [168, 29364298], [152, 29364298], [313, 4096], [241, 2, 245, 64, 93, 4194304, 249, 16777216, 144, 4096], [268, 70342974373338], [104, 70342974373338], [321, 268435456, 325, 2, 329, 8796093022208], [333, 2147483648], [108, 70342974373338], [309, 16, 292, 2199023517712], [132, 31461450], [337, 524288], [341, 549755813888], [244, 70342974373338], [345, 128, 349, 68719476736], [288, 2199023517712], [96, 70342974373338], [148, 29364298], [184, 70342974373338], [357, 2, 361, 137438953472], [264, 70342974373338], [196, 70342974373338], [120, 70342974373338], [248, 70342974373338], [365, 2, 373, 128, 377, 256], [216, 549755813888], [381, 128, 385, 68719476736], [389, 2], [240, 70342974373338], [232, 68719476864], [224, 68719476864], [228, 68719476864], [236, 68719476864], [365, 2, 373, 128, 377, 256], [212, 549755813888], [397, 8192], [220, 68719476864], [100, 70342974373338]] + goto_hash = {16 => {35 => 52}, 0 => {1 => 2}, 1 => {5 => 18, 11 => 37, 22 => 26, 33 => 12, 6 => 29, 12 => 30, 34 => 16, 23 => 10, 7 => 11, 13 => 36, 35 => 34, 2 => 31, 24 => 5, 19 => 27, 8 => 22, 3 => 15, 25 => 9, 14 => 8, 9 => 14, 20 => 7, 15 => 32, 4 => 19, 26 => 4, 10 => 28, 32 => 25, 21 => 6}, 95 => {31 => 98}, 57 => {29 => 74}, 35 => {16 => 64, 17 => 65, 18 => 63, 14 => 59}, 86 => {31 => 92}, 15 => {5 => 18, 11 => 37, 22 => 26, 33 => 12, 6 => 29, 12 => 30, 34 => 16, 23 => 10, 7 => 11, 13 => 36, 35 => 34, 24 => 5, 19 => 27, 8 => 22, 25 => 9, 14 => 8, 9 => 14, 20 => 7, 15 => 32, 4 => 50, 26 => 4, 10 => 28, 32 => 25, 21 => 6}, 76 => {30 => 88}, 65 => {18 => 79, 14 => 59}} + @@parse_table70302297839580 = ParseTable.new(productions,tokens,priorities,action_table,goto_hash,2,[ :REDUCE, :SHIFT, :ACCEPT ]) def Rpdf2txt._text_parser - GeneralizedLrParser.new(@@parse_table70032185719080) + GeneralizedLrParser.new(@@parse_table70302297839580) end end diff --git a/lib/rpdf2txt/object.rb b/lib/rpdf2txt/object.rb index 4842f2a..40a5668 100644 --- a/lib/rpdf2txt/object.rb +++ b/lib/rpdf2txt/object.rb @@ -27,7 +27,7 @@ require 'rpdf2txt/text' require 'rpdf2txt/attributesparser' require 'rpdf2txt/cmapparser' require 'rpdf2txt/symbol' -require 'md5' +require 'digest/md5' require 'matrix' module Rpdf2txt @@ -442,11 +442,21 @@ module Rpdf2txt yield self end def extract_oids(array) +#print"array encoding=" +#p array.encoding + #array.collect{ |dirty_id| + array = if array.class == String + array.each_line + end + if(array) array.collect{ |dirty_id| if(match = /\d+/on.match(dirty_id)) match[0].to_i end }.compact + else + [] + end end def root? !(@parent || @attributes[:parent]) diff --git a/lib/rpdf2txt/parser.rb b/lib/rpdf2txt/parser.rb index 3b0e252..901a3e3 100644 --- a/lib/rpdf2txt/parser.rb +++ b/lib/rpdf2txt/parser.rb @@ -1,4 +1,5 @@ #!/usr/bin/env ruby +# encoding: ascii-8bit # # Rpdf2txt -- PDF to Text Parser # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss @@ -25,7 +26,7 @@ require 'zlib' require 'rpdf2txt/object' require 'rpdf2txt/default_handler' -require 'md5' +require 'digest/md5' module Rpdf2txt VERSION = '0.8.2' @@ -128,7 +129,11 @@ module Rpdf2txt startobj=0 endobj=0 catalogue = {} - @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match| + #@src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |match| +#print "encoding=" +#p @src.encoding +@src.force_encoding('ascii-8bit') + @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/m) do |match| obj = build_object(match.to_s) catalogue.store(obj.oid, obj) end