diff --git a/characterSet.go b/characterSet.go index d3c765bad5bfb4f1f799a12d660dd9a1f8ab4b73..13bd40b1063cd430e82d9361152add569fbb3686 100644 --- a/characterSet.go +++ b/characterSet.go @@ -4,6 +4,7 @@ import ( "encoding/hex" "fmt" "strconv" + "strings" "unicode" ) @@ -65,6 +66,8 @@ func octalToRune(octal []rune) rune { } func hexToString(hexStr string) (string, error) { + // Remove spaces from the hex string to ensure clean decoding + hexStr = strings.ReplaceAll(hexStr, " ", "") if len(hexStr)%2 != 0 { hexStr += "0" diff --git a/characterSet_test.go b/characterSet_test.go index f944a8e9e81b07d8f72acfd09d1bd2987a2d0445..05a941cde08f85c46143ac91602c6c0d70341247 100644 --- a/characterSet_test.go +++ b/characterSet_test.go @@ -58,3 +58,17 @@ func TestIsLeftParenthesis(t *testing.T) { } } } + +func TestHexToString(t *testing.T) { + hexString := "EF BB BF E8 A1 A8 E3 83 9D E3 81 82 41 E9 B7 97 C5 92 C3 A9 EF BC A2 E9 80 8D C3 9C C3 9F C2 AA C4 85 C3 B1 E4 B8 82 E3 90 80 F0 A0 80 80" + expected := "\ufeff表ãƒã‚A鷗ŒéBé€ÃœÃŸÂªÄ…ñ丂ã€ð €€" + result, err := hexToString(hexString) + + if err != nil { + t.Errorf("Expected no error, got %v", err) + } + + if result != expected { + t.Errorf("Expected %q, got %q", expected, result) + } +} diff --git a/constants.go b/constants.go index 8e4c9685900af932b9402aca72ceca976637856d..2977adacfc35c5769debd2e44b850289bbfb6231 100644 --- a/constants.go +++ b/constants.go @@ -1,14 +1,17 @@ package pdf const MagicNumber = "PDF-" -const EndOfFileMagicNumber = "%EOF" +const EndOfFileMagicNumber = "%%EOF" +const keywordXRef = "xref" const keywordStartXRef = "startxref" -const keywordStartObj = "obj" + +// const keywordStartObj = "obj" const keywordEndObj = "endobj" + const keywordStartStream = "stream" const keywordEndStream = "endstream" const keywordStartTrailer = "trailer" -const keywordStartDictionary = "<<" -const keywordEndDictionary = ">>" +//const keywordStartDictionary = "<<" +//const keywordEndDictionary = ">>" diff --git a/error.go b/error.go index e03162093e8107a5080b9ebc79b5ef9ae922f17d..f282ba9c484027ac88d507b7165a06e16022abd0 100644 --- a/error.go +++ b/error.go @@ -3,22 +3,36 @@ package pdf import "errors" var ( - ErrInvalidPDFMagicNumber = errors.New("invalid PDF magic number") - ErrUnexpectedEOF = errors.New("unexpected EOF") - ErrNoRuneToUnread = errors.New("no rune to unread") - ErrInvalidByteOffset = errors.New("invalid byte offset") - ErrorTokenNil = errors.New("token is nil") - ErrTokenAlreadyEmitted = errors.New("token already emitted") - ErrEndOfLineExpected = errors.New("end of line expected") - ErrExpectedEndOfLine = errors.New("expected the end of line") - ErrExpectedDictionary = errors.New("expected dictionary") - ErrUnknownValueType = errors.New("unknown value type") - ErrInvalidNumber = errors.New("invalid number") - ErrInvalidBoolean = errors.New("invalid boolean") - ErrInvalidParenthesesCount = errors.New("invalid parentheses count") - ErrInvalidName = errors.New("invalid name") - ErrInvalidHexDigit = errors.New("invalid hex digit") - errArrayObjectMustStartWithBracket = errors.New("array object must start with a bracket") - errArrayObjectMustEndWithBracket = errors.New("array object must end with a bracket") - ErrInvalidNull = errors.New("invalid null") + ErrInvalidPDFMagicNumber = errors.New("invalid PDF magic number") + ErrUnexpectedEOF = errors.New("unexpected EOF") + ErrNoRuneToUnread = errors.New("no rune to unread") + ErrInvalidByteOffset = errors.New("invalid byte offset") + ErrorTokenNil = errors.New("token is nil") + ErrTokenAlreadyEmitted = errors.New("token already emitted") + ErrEndOfLineExpected = errors.New("end of line expected") + ErrExpectedEndOfLine = errors.New("expected the end of line") + ErrExpectedDictionary = errors.New("expected dictionary") + ErrUnknownValueType = errors.New("unknown value type") + ErrInvalidNumber = errors.New("invalid number") + ErrInvalidBoolean = errors.New("invalid boolean") + ErrInvalidParenthesesCount = errors.New("invalid parentheses count") + ErrInvalidName = errors.New("invalid name") + ErrInvalidHexDigit = errors.New("invalid hex digit") + errArrayObjectMustStartWithBracket = errors.New("array object must start with a bracket") + errArrayObjectMustEndWithBracket = errors.New("array object must end with a bracket") + ErrInvalidNull = errors.New("invalid null") + errDictionaryObjectMustStartWithDoubleLessThanSign = errors.New("dictionary object must start with double less than sign") + errDictionaryObjectMustEndWithDoubleGreaterThanSign = errors.New("dictionary object must End with double greater than sign") + ErrInvalidEndStream = errors.New("invalid end stream") + ErrInvalidObjectLineFormat = errors.New("invalid object line format") + ErrUnexpectedEndOfLine = errors.New("unexpected end of line") + ErrInvalidIndirectObjectIdentifier = errors.New("invalid indirect object identifier") + ErrInvalidIndirectObjectKeyword = errors.New("invalid indirect object keyword") + ErrInvalidIndirectObjectGeneration = errors.New("invalid indirect object generation") + ErrInvalidIndirectEndObjectKeyword = errors.New("invalid indirect end object keyword") + ErrInvalidReferenceObject = errors.New("invalid reference object") + ErrInvalidStartXref = errors.New("invalid startxref") + ErrUnexpectedNullToken = errors.New("unexpected null token") + ErrInvalidXref = errors.New("invalid xref") + ErrInvalidXrefSyntax = errors.New("invalid xref syntax") ) diff --git a/parser_test.go b/parser_test.go index f7f4f5a862bb701dcb4c324c5e395a6f5bfb3413..3e54bc104f901bf956fd66dfe94975bee79f217a 100644 --- a/parser_test.go +++ b/parser_test.go @@ -14,19 +14,33 @@ func TestNewParser(t *testing.T) { func TestParser_Parse(t *testing.T) { - file, err := os.Open("assets/Simple-PDF-2.0-file.pdf") - if err != nil { - t.Errorf("Expected to open the file, got %v", err) - os.Exit(1) + files := []string{ + "pdf20-utf8-test.pdf", + "Simple-PDF-2.0-file.pdf", + "PDF-2.0-image-with-BPC.pdf", + "PDF-2.0-UTF-8-string-and-annotation.pdf", + "PDF-2.0-via-incremental-save.pdf", + "PDF-2.0-with-offset-start.pdf", + "PDF-2.0-with-page-level-output-intent.pdf", } - p := NewParser(file) - p.Parse() + for _, f := range files { + t.Run("TestParser_Parse "+f, func(t *testing.T) { + file, err := os.Open("assets/" + f) + if err != nil { + t.Errorf("Expected to open the file, got %v", err) + return + } - if p.HasError() { - for _, e := range p.GetErrors() { - t.Errorf("Expected no errors, got %v", e) - } + p := NewParser(file) + p.Parse() + + if p.HasError() == true { + for _, e := range p.GetErrors() { + t.Errorf("Expected no errors, got %v", e) + } + } + }) } } diff --git a/stateBeforeDictionaryValue.go b/stateBeforeDictionaryValue.go index 9036c23d8b3a55fbcf06ac0a1b1a62231ff671cb..68a46ec38e7ccfaea4b37703b16caaaaa66b9f61 100644 --- a/stateBeforeDictionaryValue.go +++ b/stateBeforeDictionaryValue.go @@ -17,11 +17,6 @@ func (s beforeDictionaryValueState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if isEndOfLine(c) { continue } diff --git a/stateBeforeKey.go b/stateBeforeKey.go deleted file mode 100644 index af000cf3321f295c0ece693dc774b242dad1ba4a..0000000000000000000000000000000000000000 --- a/stateBeforeKey.go +++ /dev/null @@ -1,42 +0,0 @@ -package pdf - -type beforeKeyState struct { - token *dictionaryToken -} - -func (s beforeKeyState) process(t *Tokenizer) { - - var c rune - var err error - - for { - - c, err = t.consumeNextRune() - - if err != nil { - t.raiseError(err) - return - } - - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - - if isEndOfLine(c) { - continue - } - - if !isWhitespace(c) { - break - } - } - - if isSolidus(c) { - t.switchState(dictionaryKeyState{ - token: s.token, - }) - } - - return -} diff --git a/stateByteOffsetOflastCrossreferenceSection.go b/stateByteOffsetOflastCrossreferenceSection.go index ce4d3e682b19094e7b4028f1e0e0cb77f15860c5..0a933795240390a26756740d0371076c3611dbfe 100644 --- a/stateByteOffsetOflastCrossreferenceSection.go +++ b/stateByteOffsetOflastCrossreferenceSection.go @@ -3,7 +3,7 @@ package pdf import "strconv" type byteOffsetOfLastCrossReferenceSectionState struct { - token *xrefToken + token *xrefOffsetToken } func (s byteOffsetOfLastCrossReferenceSectionState) process(t *Tokenizer) { @@ -21,11 +21,6 @@ func (s byteOffsetOfLastCrossReferenceSectionState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if isEndOfLine(c) { o, err := strconv.ParseInt(string(s.token.values), 10, 64) @@ -40,7 +35,7 @@ func (s byteOffsetOfLastCrossReferenceSectionState) process(t *Tokenizer) { t.switchState(endOfLineState{ token: &endOfLineToken{}, }) - break + return } if !isNumeric(c) { @@ -51,7 +46,4 @@ func (s byteOffsetOfLastCrossReferenceSectionState) process(t *Tokenizer) { s.token.values = append(s.token.values, c) } - - return - } diff --git a/stateByteOffsetOflastCrossreferenceSection_test.go b/stateByteOffsetOflastCrossreferenceSection_test.go index 0a21951fd0bcce1925cdba261162c40f72b97958..f33be05448a630ef269623cb6807d383b6042e0d 100644 --- a/stateByteOffsetOflastCrossreferenceSection_test.go +++ b/stateByteOffsetOflastCrossreferenceSection_test.go @@ -16,14 +16,14 @@ func TestByteOffsetLastCrossReferenceSectionState(t *testing.T) { }{ {"No numeric value", "x", byteOffsetOfLastCrossReferenceSectionState{}, ErrInvalidByteOffset, []tokenInterface{}}, {"numeric value", "1234", byteOffsetOfLastCrossReferenceSectionState{}, io.EOF, []tokenInterface{}}, - {"numeric value with linefeed", "1234\n", endOfLineState{}, io.EOF, []tokenInterface{&xrefToken{}}}, + {"numeric value with linefeed", "1234\n", endOfLineState{}, io.EOF, []tokenInterface{&xrefOffsetToken{}}}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { tok := createTestTokenizerWithData(tt.input) tok.state = byteOffsetOfLastCrossReferenceSectionState{ - token: &xrefToken{}, + token: &xrefOffsetToken{}, } // Verarbeite den Tokenizer @@ -40,7 +40,7 @@ func TestByteOffsetLastCrossReferenceSectionState(t *testing.T) { func TestByteOffsetLastCrossReferenceSectionStateValue(t *testing.T) { tok := createTestTokenizerWithData("1\n") tok.state = byteOffsetOfLastCrossReferenceSectionState{ - token: &xrefToken{}, + token: &xrefOffsetToken{}, } // Verarbeite den Tokenizer @@ -48,10 +48,10 @@ func TestByteOffsetLastCrossReferenceSectionStateValue(t *testing.T) { checkStateAgainstDef(t, tok, endOfLineState{}) checkErrors(t, tok, io.EOF) - checkTokenStack(t, tok, []tokenInterface{&xrefToken{}}) + checkTokenStack(t, tok, []tokenInterface{&xrefOffsetToken{}}) - if tok.getLastToken().(*xrefToken).byteOffsetXRef != 1 { - t.Errorf("Expected byte offset to be 1, got %d", tok.getLastToken().(*xrefToken).byteOffsetXRef) + if (*tok.peekToken()).(*xrefOffsetToken).byteOffsetXRef != 1 { + t.Errorf("Expected byte offset to be 1, got %d", (*tok.peekToken()).(*xrefOffsetToken).byteOffsetXRef) } } diff --git a/stateComment.go b/stateComment.go index c0c05f4dc332cac8095cab30033e296860935962..c3aef3bd58dc5c922ce0c9fdba78d688e89eba66 100644 --- a/stateComment.go +++ b/stateComment.go @@ -29,19 +29,19 @@ func (s commentState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.emitToken(s.token) - t.raiseError(ErrUnexpectedEOF) - return - } - if isEndOfLine(c) { t.emitToken(s.token) + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + t.switchState(endOfLineState{ token: &endOfLineToken{}, }) - break + return } s.token.values = append(s.token.values, c) diff --git a/stateDictionary.go b/stateDictionary.go deleted file mode 100644 index bb47c501378035b555353e8248051d5c7445d6d1..0000000000000000000000000000000000000000 --- a/stateDictionary.go +++ /dev/null @@ -1,13 +0,0 @@ -package pdf - -type dictionaryToken struct { - tokenInternal -} - -type dictionaryState struct { - token *dictionaryToken -} - -func (s dictionaryState) process(t *Tokenizer) { - -} diff --git a/stateDictionaryKey.go b/stateDictionaryKey.go deleted file mode 100644 index 117dfbc8518e14e01f5b10c9f3a069c3172336fb..0000000000000000000000000000000000000000 --- a/stateDictionaryKey.go +++ /dev/null @@ -1,46 +0,0 @@ -package pdf - -type keywordToken struct { - tokenInternal -} - -type dictionaryKeyState struct { - token *dictionaryToken -} - -func (s dictionaryKeyState) process(t *Tokenizer) { - - var c rune - var err error - - var key string - - for { - - c, err = t.consumeNextRune() - - if err != nil { - t.raiseError(err) - return - } - - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - - if isEndOfLine(c) { - t.raiseError(ErrExpectedDictionary) - return - } - - if isWhitespace(c) { - - t.switchState(beforeDictionaryValueState{}) - return - } - - key += string(c) - - } -} diff --git a/stateEndDictionaryObject.go b/stateEndDictionaryObject.go new file mode 100644 index 0000000000000000000000000000000000000000..3e595806c331e18bcb10a97626a8c6eab12c93c1 --- /dev/null +++ b/stateEndDictionaryObject.go @@ -0,0 +1,28 @@ +package pdf + +type endDictionaryObjectToken struct { + tokenInternal +} + +type endDictionaryObjectState struct { + token *endDictionaryObjectToken +} + +func (s endDictionaryObjectState) process(t *Tokenizer) { + var err error + var r []rune + + r, err = t.consumeNextRunes(2) + if err != nil { + t.raiseError(err) + return + } + + if len(r) != 2 || !isGreaterThanSign(r[0]) || !isGreaterThanSign(r[1]) { + t.raiseError(errDictionaryObjectMustEndWithDoubleGreaterThanSign) + return + } + + t.emitToken(s.token) + +} diff --git a/stateEndOfFile.go b/stateEndOfFile.go index 43ee9292e702a4a68e8ca8b60cdf09dfd8be8f4d..6beecc257f2739e80e1fe3de0c86cd9350eac704 100644 --- a/stateEndOfFile.go +++ b/stateEndOfFile.go @@ -21,11 +21,6 @@ func (s endOfFileState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if c == 'E' || c == 'O' || c == 'F' || isPercentSign(c) { s.token.values = append(s.token.values, c) @@ -42,11 +37,19 @@ func (s endOfFileState) process(t *Tokenizer) { } if string(s.token.values) != EndOfFileMagicNumber { - err := t.unreadCount(len(s.token.values)) - if err != nil { - t.raiseError(err) + + if s.token.values == nil { return } + + l := len(s.token.values) + if l > 0 { + err := t.unreadCount(l) + if err != nil { + t.raiseError(err) + return + } + } return } diff --git a/stateEndOfFile_test.go b/stateEndOfFile_test.go index e33a62da0542fcaf411e677524f0122b7a615880..154ba30d57436d828cb4141b6055ef3b8eb708f2 100644 --- a/stateEndOfFile_test.go +++ b/stateEndOfFile_test.go @@ -16,7 +16,7 @@ func TestEndOfFileState(t *testing.T) { }{ {"Empty File", "", endOfFileState{}, io.EOF, []tokenInterface{}}, {"End of file", "EOF", endOfFileState{}, io.EOF, []tokenInterface{}}, - {"End of file with end of line", "%EOF\n", endOfLineState{}, nil, []tokenInterface{&endOfFileToken{}, &endOfLineToken{}}}, + {"End of file with end of line", "%%EOF\n", endOfLineState{}, nil, []tokenInterface{&endOfFileToken{}, &endOfLineToken{}}}, } for _, tt := range tests { @@ -44,7 +44,7 @@ func TestEndOfFileTokenIsNil(t *testing.T) { // Verarbeite den Tokenizer tok.state.process(tok) - if !tok.HasError() { + if !tok.hasError() { t.Errorf("Expected error, got none") } } diff --git a/stateEndOfIndirectObject.go b/stateEndOfIndirectObject.go new file mode 100644 index 0000000000000000000000000000000000000000..971151908f0e538fbbd46f8ac01988ef3e81d6ce --- /dev/null +++ b/stateEndOfIndirectObject.go @@ -0,0 +1,46 @@ +package pdf + +type endOfIndirectObjectToken struct { + tokenInternal +} + +type endOfIndirectObjectState struct { + token *endOfIndirectObjectToken +} + +func (s endOfIndirectObjectState) process(t *Tokenizer) { + + var c rune + var err error + var value string + + for { + + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + if c != 'e' && c != 'n' && c != 'd' && c != 'o' && c != 'b' && c != 'j' { + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + break + } + + value += string(c) + } + + if value != keywordEndObj { + t.raiseError(ErrInvalidIndirectEndObjectKeyword) + return + } + + t.emitToken(s.token) + + return +} diff --git a/stateEndOfIndirectObject_test.go b/stateEndOfIndirectObject_test.go new file mode 100644 index 0000000000000000000000000000000000000000..3e52ca189e083671f333cb7ed8bb4478515b042c --- /dev/null +++ b/stateEndOfIndirectObject_test.go @@ -0,0 +1,38 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestEndIndirectObjectStat_Process(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + }{ + {"empty", "", endOfIndirectObjectState{}, io.EOF, []tokenInterface{}}, + {"emd of stream", "endobj ", + endOfIndirectObjectState{}, nil, []tokenInterface{&endOfIndirectObjectToken{}}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = endOfIndirectObjectState{ + token: &endOfIndirectObjectToken{}, + } + + // Verarbeite den Tokenizer + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + }) + } +} diff --git a/stateEndOfLine_test.go b/stateEndOfLine_test.go index 6e77439f1684a6851e60cfb875a694d2c0f4a214..e80b242726143d82495633daf12a2b77b1b36b8d 100644 --- a/stateEndOfLine_test.go +++ b/stateEndOfLine_test.go @@ -41,7 +41,7 @@ func TestEndOfLineTokenIsNil(t *testing.T) { // Verarbeite den Tokenizer tok.state.process(tok) - if !tok.HasError() { + if !tok.hasError() { t.Errorf("Expected error, got none") } } diff --git a/stateEndStreamObject.go b/stateEndStreamObject.go new file mode 100644 index 0000000000000000000000000000000000000000..d4a33c762d482f1c4b9e78eae1a07d6cc7914b99 --- /dev/null +++ b/stateEndStreamObject.go @@ -0,0 +1,27 @@ +package pdf + +type endStreamObjectToken struct { + tokenInternal +} + +type endStreamObjectState struct { + token *endStreamObjectToken +} + +func (s endStreamObjectState) process(t *Tokenizer) { + var r []rune + var err error + + r, err = t.consumeNextRunes(9) + if err != nil { + t.raiseError(err) + return + } + + if string(r) != keywordEndStream { + t.raiseError(ErrInvalidEndStream) + return + } + + t.emitToken(s.token) +} diff --git a/stateEndStreamObject_test.go b/stateEndStreamObject_test.go new file mode 100644 index 0000000000000000000000000000000000000000..dc58775c26e4a6f1907b6f9990d0502c2475311f --- /dev/null +++ b/stateEndStreamObject_test.go @@ -0,0 +1,38 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestEndStreamObjectStat_Process(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + }{ + {"empty", "", endStreamObjectState{}, io.EOF, []tokenInterface{}}, + {"emd of stream", "endstream", + endStreamObjectState{}, nil, []tokenInterface{&endStreamObjectToken{}}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = endStreamObjectState{ + token: &endStreamObjectToken{}, + } + + // Verarbeite den Tokenizer + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + }) + } +} diff --git a/stateFileHeader.go b/stateFileHeader.go index 8163dd8dbd20276c704860650212d62d433ff5dc..c45f62a11b25f0bfca6017482a3db9bf4663904c 100644 --- a/stateFileHeader.go +++ b/stateFileHeader.go @@ -17,12 +17,6 @@ func (s fileHeaderState) process(t *Tokenizer) { return } - // End of the file - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if c == 'P' || c == 'D' || c == 'F' || c == '-' { s.token.values = append(s.token.values, c) continue diff --git a/stateFileHeader_test.go b/stateFileHeader_test.go index e699d2967050fa40c056882ba67993292da0de69..d9f2a6913d132df71413b323e90037a7aa7d09eb 100644 --- a/stateFileHeader_test.go +++ b/stateFileHeader_test.go @@ -42,7 +42,7 @@ func TestFileHeaderTokenIsNil(t *testing.T) { // Verarbeite den Tokenizer tok.state.process(tok) - if !tok.HasError() { + if !tok.hasError() { t.Errorf("Expected error, got none") } } diff --git a/stateIdentifyLineType.go b/stateIdentifyLineType.go index bbe092aebdc200efebae8a175d8c18a0c59bf438..1b109c7b53113e2909338dde376f98a0f36eac86 100644 --- a/stateIdentifyLineType.go +++ b/stateIdentifyLineType.go @@ -1,30 +1,101 @@ package pdf -import "fmt" +import ( + "strings" +) -type identifyLineTypeState struct { -} +type identifyLineTypeState struct{} func (s identifyLineTypeState) process(t *Tokenizer) { buffer := "" for { + c, err := t.consumeNextRune() if err != nil { t.raiseError(err) return } - if isEndOfFile(c) { - return - } - if isEndOfLine(c) { - t.switchState(endOfLineState{ - token: &endOfLineToken{}, - }) - break + + switch { + case buffer == keywordStartXRef: + + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + t.switchState(readXrefOffsetState{ + token: &xrefOffsetToken{}, + }) + + t.switchState(endOfFileState{ + token: &endOfFileToken{}, + }) + + return + + case buffer == keywordXRef: + + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + t.switchState(readXrefState{ + token: &xrefToken{}, + }) + + return + + case buffer == keywordStartTrailer: + + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + t.switchState(trailerStartState{ + token: &trailerToken{}, + }) + + return + case strings.TrimSpace(buffer) == "": + + t.switchState(endOfLineState{ + token: &endOfLineToken{}, + }) + return + + default: + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + err = t.unreadCount(len(buffer)) + if err != nil { + t.raiseError(err) + return + } + + t.switchState(readIndirectObjectState{ + token: &indirectObjectToken{}, + }) + + t.switchState(endOfLineState{ + token: &endOfLineToken{}, + }) + return + + } } if isPercentSign(c) { @@ -44,30 +115,9 @@ func (s identifyLineTypeState) process(t *Tokenizer) { }) continue - } buffer += string(c) - switch buffer { - case keywordStartObj: - fmt.Println("OBJ") - case keywordEndObj: - fmt.Println("ENDOBJ") - case keywordStartStream: - fmt.Println("STREAM") - case keywordEndStream: - fmt.Println("ENDSTREAM") - case keywordStartXRef: - t.switchState(xrefState{ - token: &xrefToken{}, - }) - case keywordStartTrailer: - t.switchState(trailerStartState{ - token: &trailerToken{}, - }) - - } - } } diff --git a/stateIdentifyLineType_test.go b/stateIdentifyLineType_test.go index 22a54f6c9aab3afdd28f1eea24c58e1bae97f6f2..41eb33edf9ad21a21268f5a27c398f89f48043c1 100644 --- a/stateIdentifyLineType_test.go +++ b/stateIdentifyLineType_test.go @@ -1,7 +1,6 @@ package pdf import ( - "io" "testing" ) @@ -14,7 +13,145 @@ func TestIdentifyLineTypeState_Process(t *testing.T) { expectedError error expectTokens []tokenInterface }{ - {"Simple Trailer example", "trailer\n<<\n/Size 22\n/Root 1 0 R\n/Info 2 0 R\n>>\nstartxref\n123456\n%%EOF\n", whitespaceState{}, io.EOF, []tokenInterface{&whitespaceToken{}}}, + {"Simple Trailer example", "trailer\n<<\n/Size 22\n/Root 1 0 R\n/Info 2 0 R\n>>\nstartxref\n123456\n%%EOF\n", + endOfLineState{}, + nil, + []tokenInterface{ + &trailerToken{}, + &startDictionaryObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &referenceObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &referenceObjectToken{}, + &whitespaceToken{}, + &endDictionaryObjectToken{}, + &whitespaceToken{}, + &endOfLineToken{}, + &startStartxrefToken{}, + &endOfLineToken{}, + &xrefOffsetToken{}, + &endOfLineToken{}, + &endOfFileToken{}, + &endOfLineToken{}, + }}, + {"Simple example with comment", "4 0 obj\n<<\n /Type /Page\n /Parent 3 0 R\n /MediaBox [0 0 612 396]\n /Contents [5 0 R 6 0 R]\n /Resources <<\n /Font << /F1 7 0 R >>\n >>\n>>\nendobj\n", + endOfLineState{}, + nil, + []tokenInterface{ + &indirectObjectToken{}, + &indirectObjectIdentifierToken{}, + &indirectObjectGenerationToken{}, + &indirectObjectKeywordToken{}, + &startDictionaryObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &referenceObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &arrayStartToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &endArrayObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &arrayStartToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &referenceObjectToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &referenceObjectToken{}, + &endArrayObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &startDictionaryObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &startDictionaryObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &referenceObjectToken{}, + &whitespaceToken{}, + &endDictionaryObjectToken{}, + &whitespaceToken{}, + &endDictionaryObjectToken{}, + &whitespaceToken{}, + &endDictionaryObjectToken{}, + &whitespaceToken{}, + &endOfLineToken{}, + &endOfIndirectObjectToken{}, + &endOfLineToken{}, + }, + }, + {"Simple example with stream", "6 0 obj\n<< /Length 165 >>\nstream\n% A text block that shows \"Hello World\"\n% No color is set, so this defaults to black in DeviceGray colorspace\nBT\n /F1 24 Tf\n 100 100 Td\n (Hello World) Tj\nET\nendstream\nendobj\n", + endOfLineState{}, + nil, + []tokenInterface{ + + &indirectObjectToken{}, + &indirectObjectIdentifierToken{}, + &indirectObjectGenerationToken{}, + &indirectObjectKeywordToken{}, + &startDictionaryObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &endDictionaryObjectToken{}, + &whitespaceToken{}, + &endOfLineToken{}, + &startStreamObjectToken{}, + &streamObjectToken{}, + &endStreamObjectToken{}, + &endOfIndirectObjectToken{}, + &endOfLineToken{}, + }, + }, } for _, tt := range tests { diff --git a/stateInitial.go b/stateInitial.go index 0d31c6d9192a74ac1d8621545bc9ee2c4c63e069..5f74d6715e449502fb18851a49d7fa80dd499b50 100644 --- a/stateInitial.go +++ b/stateInitial.go @@ -1,7 +1,5 @@ package pdf -import "io" - type initialState struct{} func (s initialState) process(t *Tokenizer) { @@ -36,11 +34,6 @@ func (s initialState) process(t *Tokenizer) { break } - if isEndOfFile(c) { - t.raiseError(io.EOF) - return - } - } } diff --git a/stateNewline.go b/stateNewline.go index 19db0d1af7aeeff9bbe90e23eef3a001112c79e0..5273c6c655d3d4c578c463531380a8f3ea5d28ba 100644 --- a/stateNewline.go +++ b/stateNewline.go @@ -10,10 +10,6 @@ func (s newlineState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - return - } - if isEndOfLine(c) { t.switchState(endOfLineState{ token: &endOfLineToken{}, diff --git a/statePDFVersion.go b/statePDFVersion.go index 6adae4352b3485f3fc36ffdcc04ac4bccfcb6618..8fed94fffb7105792347b732cff35be14ff486ba 100644 --- a/statePDFVersion.go +++ b/statePDFVersion.go @@ -24,11 +24,6 @@ func (s pdfVersionState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if isEndOfLine(c) { t.emitToken(s.token) diff --git a/stateReadBooleanObject.go b/stateReadBooleanObject.go index deaa733ad49754f182eb6860107da19af794eea9..5612e219cc131129514fbf955609749342e7df05 100644 --- a/stateReadBooleanObject.go +++ b/stateReadBooleanObject.go @@ -23,11 +23,6 @@ func (s readBooleanObjectState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if c != 'f' && c != 'a' && c != 'l' && c != 's' && c != 'e' && c != 't' && c != 'r' && c != 'u' { break } @@ -40,8 +35,13 @@ func (s readBooleanObjectState) process(t *Tokenizer) { return } + err = t.unread() + if err != nil { + t.raiseError(err) + return + } + s.token.value = value t.emitToken(s.token) - return } diff --git a/stateReadIndirectObject.go b/stateReadIndirectObject.go new file mode 100644 index 0000000000000000000000000000000000000000..60d33dd122b4a285b49bb3ee832109293f4fe9a0 --- /dev/null +++ b/stateReadIndirectObject.go @@ -0,0 +1,114 @@ +package pdf + +type indirectObjectToken struct { + tokenInternal +} + +type readIndirectObjectState struct { + token *indirectObjectToken +} + +func (s readIndirectObjectState) process(t *Tokenizer) { + + var c rune + var err error + + for { + + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + switch { + case isWhitespace(c): + + t.switchState(whitespaceState{ + token: &whitespaceToken{}, + }) + + case isNumeric(c): + + t.emitToken(s.token) + + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + p, err := t.peekNextRune() + if err != nil { + t.raiseError(err) + return + } + + if isWhitespace(p) { + t.switchState(whitespaceState{ + token: &whitespaceToken{}, + }) + } + + t.switchState(readIndirectObjectIdentifierState{ + token: &indirectObjectIdentifierToken{}, + }) + + for { + p, err = t.peekNextRune() + if err != nil { + t.raiseError(err) + return + } + + if isWhitespace(p) { + t.switchState(whitespaceState{ + token: &whitespaceToken{}, + }) + } + + t.switchState(startValueState{}) + + p, err = t.peekNextRune() + if err != nil { + t.raiseError(err) + return + } + + if isWhitespace(p) { + t.switchState(whitespaceState{ + token: &whitespaceToken{}, + }) + } + + t.switchState(endOfIndirectObjectState{ + token: &endOfIndirectObjectToken{}, + }) + + p := *t.peekToken() + if p == nil { + t.raiseError(ErrUnexpectedNullToken) + return + } + + if _, ok := p.(*endOfIndirectObjectToken); ok { + return + } + + t.removeLastError() + + } + + default: + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + t.raiseError(ErrInvalidIndirectObjectIdentifier) + return + } + + } + +} diff --git a/stateReadIndirectObjectGeneration.go b/stateReadIndirectObjectGeneration.go new file mode 100644 index 0000000000000000000000000000000000000000..8410a4772bfcab35e6cbe98a4e27feb564c90a32 --- /dev/null +++ b/stateReadIndirectObjectGeneration.go @@ -0,0 +1,57 @@ +package pdf + +import "strconv" + +type indirectObjectGenerationToken struct { + tokenInternal + value int +} + +type readIndirectObjectGenerationState struct { + token *indirectObjectGenerationToken +} + +func (s readIndirectObjectGenerationState) process(t *Tokenizer) { + + var c rune + var err error + + generation := "" + + for { + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + if isEndOfLine(c) { + t.raiseError(ErrUnexpectedEndOfLine) + return + } + + if isWhitespace(c) { + + v, err := strconv.Atoi(generation) + if err != nil { + t.raiseError(err) + return + } + s.token.value = v + t.emitToken(s.token) + t.switchState(readIndirectObjectKeywordState{ + token: &indirectObjectKeywordToken{}, + }) + return + } + + if !isNumeric(c) { + t.raiseError(ErrInvalidIndirectObjectGeneration) + return + } + + generation += string(c) + + } + +} diff --git a/stateReadIndirectObjectGeneration_test.go b/stateReadIndirectObjectGeneration_test.go new file mode 100644 index 0000000000000000000000000000000000000000..43056877a43a86aa14333a7f3fd98f43d3f8557e --- /dev/null +++ b/stateReadIndirectObjectGeneration_test.go @@ -0,0 +1,50 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestReadIndirectObjectGenerationState_Process(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + expectedValue int + }{ + {"empty", "", readIndirectObjectGenerationState{}, io.EOF, []tokenInterface{}, 0}, + {"Generation", "12", readIndirectObjectGenerationState{}, io.EOF, []tokenInterface{}, 0}, + {"Generation with space", "12 obj ", readIndirectObjectKeywordState{}, nil, []tokenInterface{&indirectObjectGenerationToken{}, &indirectObjectKeywordToken{}}, 12}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = readIndirectObjectGenerationState{ + token: &indirectObjectGenerationToken{}, + } + + // Verarbeite den Tokenizer + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + if tt.expectedValue > 0 { + st, ok := tok.tokenStack[0].(*indirectObjectGenerationToken) + if !ok { + t.Errorf("state is not readIndirectObjectGenerationState") + return + } + + if st.value != tt.expectedValue { + t.Errorf("expected value %d, got %d", tt.expectedValue, st.value) + } + } + }) + } +} diff --git a/stateReadIndirectObjectIdentifier.go b/stateReadIndirectObjectIdentifier.go new file mode 100644 index 0000000000000000000000000000000000000000..df11c62e7c0d7a226638aa596bac15df55182923 --- /dev/null +++ b/stateReadIndirectObjectIdentifier.go @@ -0,0 +1,57 @@ +package pdf + +import "strconv" + +type indirectObjectIdentifierToken struct { + tokenInternal + value int +} + +type readIndirectObjectIdentifierState struct { + token *indirectObjectIdentifierToken +} + +func (s readIndirectObjectIdentifierState) process(t *Tokenizer) { + + var c rune + var err error + + generation := "" + + for { + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + if isEndOfLine(c) { + t.raiseError(ErrUnexpectedEndOfLine) + return + } + + if isWhitespace(c) { + + v, err := strconv.Atoi(generation) + if err != nil { + t.raiseError(err) + return + } + s.token.value = v + t.emitToken(s.token) + t.switchState(readIndirectObjectGenerationState{ + token: &indirectObjectGenerationToken{}, + }) + return + } + + if !isNumeric(c) { + t.raiseError(ErrInvalidIndirectObjectIdentifier) + return + } + + generation += string(c) + + } + +} diff --git a/stateReadIndirectObjectIdentifier_test.go b/stateReadIndirectObjectIdentifier_test.go new file mode 100644 index 0000000000000000000000000000000000000000..ba7495b5ba3376889383d8794c7e153681cb71a9 --- /dev/null +++ b/stateReadIndirectObjectIdentifier_test.go @@ -0,0 +1,55 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestReadIndirectObjectIdentifierState_Process(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + expectedValue int + }{ + {"empty", "", readIndirectObjectIdentifierState{}, io.EOF, []tokenInterface{}, 0}, + {"ID", "12", readIndirectObjectIdentifierState{}, io.EOF, []tokenInterface{}, 0}, + {"ID with space", "23 12 obj ", readIndirectObjectKeywordState{}, nil, + []tokenInterface{ + &indirectObjectIdentifierToken{}, + &indirectObjectGenerationToken{}, + &indirectObjectKeywordToken{}}, + 23}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = readIndirectObjectIdentifierState{ + token: &indirectObjectIdentifierToken{}, + } + + // Verarbeite den Tokenizer + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + if tt.expectedValue > 0 { + st, ok := tok.tokenStack[0].(*indirectObjectIdentifierToken) + if !ok { + t.Errorf("state is not readIndirectObjectIdentifierState") + return + } + + if st.value != tt.expectedValue { + t.Errorf("expected value %d, got %d", tt.expectedValue, st.value) + } + } + }) + } +} diff --git a/stateReadIndirectObjectKeyword.go b/stateReadIndirectObjectKeyword.go new file mode 100644 index 0000000000000000000000000000000000000000..5f0ba3da71bbfec88be1ee35d80c639856a865c6 --- /dev/null +++ b/stateReadIndirectObjectKeyword.go @@ -0,0 +1,43 @@ +package pdf + +type indirectObjectKeywordToken struct { + tokenInternal + value string +} + +type readIndirectObjectKeywordState struct { + token *indirectObjectKeywordToken +} + +func (s readIndirectObjectKeywordState) process(t *Tokenizer) { + + var c rune + var err error + var value string + + for { + + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + if c != 'o' && c != 'b' && c != 'j' { + break + } + + value += string(c) + } + + if value != "obj" { + t.raiseError(ErrInvalidIndirectObjectKeyword) + return + } + + s.token.value = value + t.emitToken(s.token) + + return + +} diff --git a/stateReadIndirectObjectKeyword_test.go b/stateReadIndirectObjectKeyword_test.go new file mode 100644 index 0000000000000000000000000000000000000000..c213989cdbc43b6d75baa0bd1832c5c764c63c87 --- /dev/null +++ b/stateReadIndirectObjectKeyword_test.go @@ -0,0 +1,51 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestReadIndirectObjectKeywordState_Process(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + expectedValue string + }{ + {"empty", "", readIndirectObjectKeywordState{}, io.EOF, []tokenInterface{}, ""}, + {"obj keyword", "obj", readIndirectObjectKeywordState{}, io.EOF, []tokenInterface{}, ""}, + {"obj keyword with space", "obj ", readIndirectObjectKeywordState{}, nil, []tokenInterface{&indirectObjectKeywordToken{}}, "obj"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = readIndirectObjectKeywordState{ + token: &indirectObjectKeywordToken{}, + } + + // Verarbeite den Tokenizer + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + if len(tt.expectTokens) > 0 { + st, ok := tok.state.(readIndirectObjectKeywordState) + if !ok { + t.Errorf("state is not readNameObjectState") + return + } + + if st.token.value != tt.expectedValue { + t.Errorf("expected value %s, got %s", tt.expectedValue, st.token.value) + } + } + + }) + } +} diff --git a/stateReadIndirectObject_test.go b/stateReadIndirectObject_test.go new file mode 100644 index 0000000000000000000000000000000000000000..a7cacff432b2d26643481fcce7ae0daffa78925f --- /dev/null +++ b/stateReadIndirectObject_test.go @@ -0,0 +1,114 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestIndirectObjectState(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + expectedID int + expectedGen int + expectedType string + }{ + {"empty", "", readIndirectObjectState{}, io.EOF, []tokenInterface{}, 0, 0, ""}, + {"Example from Documentation 7.3.10", "12 0 obj\n(Brillig)\nendobj ", endOfIndirectObjectState{}, nil, + []tokenInterface{ + &indirectObjectToken{}, + &indirectObjectIdentifierToken{}, + &indirectObjectGenerationToken{}, + &indirectObjectKeywordToken{}, + &stringToken{}, + &whitespaceToken{}, + &endOfLineToken{}, + &endOfIndirectObjectToken{}, + }, 12, 0, "obj"}, + {"Example from Simple PDF File", "8 0 obj\n[ 278 556\n 556 1015\n 667 667\n 778 222\n 556 56\n 556 584 ]\nendobj\n\n", + endOfIndirectObjectState{}, + nil, []tokenInterface{ + &indirectObjectToken{}, + &indirectObjectIdentifierToken{}, + &indirectObjectGenerationToken{}, + &indirectObjectKeywordToken{}, + &arrayStartToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &endArrayObjectToken{}, + &whitespaceToken{}, + &endOfLineToken{}, + &endOfIndirectObjectToken{}, + }, 0, 0, ""}, + {"Example from Simple PDF File", "1 0 obj\n<<\n /Type /Catalog\n /Lang (en-UK) % Default language\n\n>>\nendobj\n", + endOfIndirectObjectState{}, + nil, []tokenInterface{ + + &indirectObjectToken{}, + &indirectObjectIdentifierToken{}, + &indirectObjectGenerationToken{}, + &indirectObjectKeywordToken{}, + &startDictionaryObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &stringToken{}, + &whitespaceToken{}, + &commentToken{}, + &endOfLineToken{}, + &whitespaceToken{}, + &endDictionaryObjectToken{}, + &whitespaceToken{}, + &endOfLineToken{}, + &endOfIndirectObjectToken{}, + }, 1, 0, "obj"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = readIndirectObjectState{ + token: &indirectObjectToken{}, + } + + // Verarbeite den Tokenizer + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + }) + } +} diff --git a/stateReadNameObject.go b/stateReadNameObject.go index 579bc2d8a74d8f70c71f09f6734c801fea18c581..69d61daa51583dd49cd65a2e9356c374143fafdd 100644 --- a/stateReadNameObject.go +++ b/stateReadNameObject.go @@ -26,11 +26,6 @@ func (s readNameObjectState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - // The SOLIDUS is not part of the name if !isSolidus(c) { t.raiseError(ErrInvalidName) diff --git a/stateReadNullObject.go b/stateReadNullObject.go index 4960df72c271464455a1c9c11057d278f38559d9..50fbb41d9eda3d2b6985d65fcbd5dba179fb44ec 100644 --- a/stateReadNullObject.go +++ b/stateReadNullObject.go @@ -23,11 +23,6 @@ func (s readNullObjectState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if c != 'n' && c != 'u' && c != 'l' { break } diff --git a/stateReadNumberObject.go b/stateReadNumberObject.go index 925ae18ceaeaaddb565e6ae2ca91ad1112332732..c6f569b67ed95c23e13a3d3742ab605b052c200e 100644 --- a/stateReadNumberObject.go +++ b/stateReadNumberObject.go @@ -32,11 +32,6 @@ func (s readNumberObjectState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if c == '.' && !hasDot { hasDot = true value += string(c) @@ -51,7 +46,7 @@ func (s readNumberObjectState) process(t *Tokenizer) { } firstRuneFlag = false - if !isNumeric(c) { + if isWhitespace(c) || isEndOfLine(c) || isDelimiter(c) { err := t.unread() if err != nil { t.raiseError(err) @@ -60,6 +55,17 @@ func (s readNumberObjectState) process(t *Tokenizer) { break } + if !isNumeric(c) { + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + t.raiseError(ErrInvalidNumber) + return + } + value += string(c) } diff --git a/stateReadNumberObject_test.go b/stateReadNumberObject_test.go index 01335c531d9e1defb2319a0a3a6831ee94c9de92..60956bdfa6cdbd2dcdb467034f7d5eaa1e46b6eb 100644 --- a/stateReadNumberObject_test.go +++ b/stateReadNumberObject_test.go @@ -17,15 +17,17 @@ func TestReadNumberObjectStateProcess(t *testing.T) { {"empty", "", readNumberObjectState{}, io.EOF, []tokenInterface{}}, {"simple integer", "1", readNumberObjectState{}, io.EOF, []tokenInterface{}}, {"Not a Number", "A", readNumberObjectState{}, ErrInvalidNumber, []tokenInterface{}}, - {"Number plus Char ", "1A", readNumberObjectState{}, nil, []tokenInterface{}}, + {"Number plus Char ", "1A", readNumberObjectState{}, ErrInvalidNumber, []tokenInterface{}}, {"multiple plus", "++", readNumberObjectState{}, ErrInvalidNumber, []tokenInterface{}}, {"multiple minus", "--", readNumberObjectState{}, ErrInvalidNumber, []tokenInterface{}}, {"dot and plus", ".+", readNumberObjectState{}, ErrInvalidNumber, []tokenInterface{}}, {"float", "1.", readNumberObjectState{}, io.EOF, []tokenInterface{}}, {"float dot number", "1.0", readNumberObjectState{}, io.EOF, []tokenInterface{}}, + {"float dot number 1", "1.0 ", readNumberObjectState{}, nil, []tokenInterface{&numberToken{}}}, + {"float dot number 2", "1.0\n", readNumberObjectState{}, nil, []tokenInterface{&numberToken{}}}, {"dot number", ".0", readNumberObjectState{}, io.EOF, []tokenInterface{}}, - {"dot number linefeed", ".0\n", readNumberObjectState{}, nil, []tokenInterface{}}, - {"dot number dot", ".0.", readNumberObjectState{}, nil, []tokenInterface{}}, + {"dot number linefeed", ".0\n", readNumberObjectState{}, nil, []tokenInterface{&numberToken{}}}, + {"dot number dot", ".0.", readNumberObjectState{}, ErrInvalidNumber, []tokenInterface{}}, } for _, tt := range tests { diff --git a/stateReadReferenceObject.go b/stateReadReferenceObject.go new file mode 100644 index 0000000000000000000000000000000000000000..46c0b0792954b2e20652124ec5501c07e4094260 --- /dev/null +++ b/stateReadReferenceObject.go @@ -0,0 +1,28 @@ +package pdf + +type referenceObjectToken struct { + tokenInternal +} + +type readReferenceObjectState struct { + token *referenceObjectToken +} + +func (s readReferenceObjectState) process(t *Tokenizer) { + + var c rune + var err error + + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + if c != 'R' { + t.raiseError(ErrInvalidReferenceObject) + return + } + + t.emitToken(s.token) +} diff --git a/stateStartDictionary_test.go b/stateReadReferenceObject_test.go similarity index 62% rename from stateStartDictionary_test.go rename to stateReadReferenceObject_test.go index e342be5003f614de0b5f9834b0e2bfbb377a9d9d..1e0400e64978e4128a2d09606c51177c44890463 100644 --- a/stateStartDictionary_test.go +++ b/stateReadReferenceObject_test.go @@ -1,10 +1,11 @@ package pdf import ( + "io" "testing" ) -func TestStartDictionaryState_Process(t *testing.T) { +func TestStateReadReferenceObject(t *testing.T) { tests := []struct { name string @@ -13,14 +14,15 @@ func TestStartDictionaryState_Process(t *testing.T) { expectedError error expectTokens []tokenInterface }{ - {"No end of line", "x", xrefState{}, ErrEndOfLineExpected, []tokenInterface{}}, + {"empty", "", readReferenceObjectState{}, io.EOF, []tokenInterface{}}, + {"Reference", "R", readReferenceObjectState{}, nil, []tokenInterface{&referenceObjectToken{}}}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { tok := createTestTokenizerWithData(tt.input) - tok.state = xrefState{ - token: &xrefToken{}, + tok.state = readReferenceObjectState{ + token: &referenceObjectToken{}, } // Verarbeite den Tokenizer diff --git a/stateReadStreamObject.go b/stateReadStreamObject.go new file mode 100644 index 0000000000000000000000000000000000000000..f49cf9f3870fff64bb85b8db5b0c233ece72fabf --- /dev/null +++ b/stateReadStreamObject.go @@ -0,0 +1,57 @@ +package pdf + +type streamObjectToken struct { + tokenInternal + values []rune +} + +type readStreamObjectState struct { + token *streamObjectToken +} + +func (s readStreamObjectState) process(t *Tokenizer) { + + var r []rune + + // store last 9 runes to check if we are at the end of the stream + var last9Runes []rune + + for { + + c, err := t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + last9Runes = append(last9Runes, c) + if len(last9Runes) > 9 { + last9Runes = last9Runes[1:] + } + + if string(last9Runes) == keywordEndStream { + + r = r[:len(r)-9] + err := t.unreadCount(9) + if err != nil { + t.raiseError(err) + return + } + + s.token.values = r + t.emitToken(s.token) + + t.switchState(endStreamObjectState{ + token: &endStreamObjectToken{ + tokenInternal: tokenInternal{}, + }, + }) + + return + } + + r = append(r, c) + + } + +} diff --git a/stateReadStreamObject_test.go b/stateReadStreamObject_test.go new file mode 100644 index 0000000000000000000000000000000000000000..c012933f90a7725fe95f13deb659dd65d0f49f87 --- /dev/null +++ b/stateReadStreamObject_test.go @@ -0,0 +1,38 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestReadStreamObjectStat_Process(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + }{ + {"empty", "", readStreamObjectState{}, io.EOF, []tokenInterface{}}, + {"example from documentation", "BT\n/F1 12 Tf\n72 712 Td\n(A stream with an indirect length) Tj\nET\nendstream", + endStreamObjectState{}, nil, []tokenInterface{&streamObjectToken{}, &endStreamObjectToken{}}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = readStreamObjectState{ + token: &streamObjectToken{}, + } + + // Verarbeite den Tokenizer + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + }) + } +} diff --git a/stateReadStringObject.go b/stateReadStringObject.go index a458efdd5833761919fe578e639cd14cb04e8a70..56b702a24e27df6d19e93108ff67659e439b8178 100644 --- a/stateReadStringObject.go +++ b/stateReadStringObject.go @@ -34,11 +34,6 @@ func (s readStringObjectState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if mode == ModeUnknown { switch { case isLeftParenthesis(c): @@ -67,10 +62,6 @@ func (s readStringObjectState) process(t *Tokenizer) { t.raiseError(err) return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } // A PDF writer may split a literal string across multiple lines. The REVERSE SOLIDUS (5Ch) (backslash //character) at the end of a line shall be used to indicate that the string continues on the following line. A @@ -83,10 +74,7 @@ func (s readStringObjectState) process(t *Tokenizer) { t.raiseError(err) return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } + if !isEndOfLine(c) { err = t.unread() if err != nil { @@ -139,10 +127,6 @@ func (s readStringObjectState) process(t *Tokenizer) { t.raiseError(err) return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } if !isNumeric(c) { err := t.unread() @@ -180,11 +164,6 @@ func (s readStringObjectState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if !isEndOfLine(c) { err := t.unread() if err != nil { diff --git a/stateReadXref.go b/stateReadXref.go new file mode 100644 index 0000000000000000000000000000000000000000..0eda1d2a2dd3f256c59b946d82e1a481b093ac80 --- /dev/null +++ b/stateReadXref.go @@ -0,0 +1,72 @@ +package pdf + +type xrefToken struct { + tokenInternal +} + +type xrefStateToken struct { + tokenInternal + value rune +} + +type readXrefState struct { + token *xrefToken +} + +func (s readXrefState) process(t *Tokenizer) { + + var c rune + var err error + + for { + + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + switch { + case isEndOfLine(c): + t.switchState(endOfLineState{ + token: &endOfLineToken{}, + }) + continue + + case isWhitespace(c): + + t.switchState(whitespaceState{ + token: &whitespaceToken{}, + }) + continue + + case isNumeric(c): + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + t.switchState(readNumberObjectState{ + token: &numberToken{}, + }) + + continue + + case c == 'n' || c == 'f': + t.emitToken(&xrefStateToken{ + value: c, + }) + + case c == 't': + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + t.emitToken(s.token) + return + } + } + +} diff --git a/stateXref.go b/stateReadXrefOffset.go similarity index 68% rename from stateXref.go rename to stateReadXrefOffset.go index 04db33ed3ec5c7c7685d8afa5ceb510e837f58cf..17b26201ca9cd4e6a3df4eda2bbfcbd26bfdbdbe 100644 --- a/stateXref.go +++ b/stateReadXrefOffset.go @@ -1,15 +1,15 @@ package pdf -type xrefToken struct { +type xrefOffsetToken struct { tokenInternal byteOffsetXRef int64 } -type xrefState struct { - token *xrefToken +type readXrefOffsetState struct { + token *xrefOffsetToken } -func (s xrefState) process(t *Tokenizer) { +func (s readXrefOffsetState) process(t *Tokenizer) { for { c, err := t.consumeNextRune() @@ -18,11 +18,6 @@ func (s xrefState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if isEndOfLine(c) { t.switchState(endOfLineState{ token: &endOfLineToken{}, diff --git a/stateXref_test.go b/stateReadXrefOffset_test.go similarity index 71% rename from stateXref_test.go rename to stateReadXrefOffset_test.go index f951bbf58d16f99a19c1f6a883c0f2ee5ec0fd9f..ae6ca55bf6defca949c22a9adf9335847528065e 100644 --- a/stateXref_test.go +++ b/stateReadXrefOffset_test.go @@ -13,15 +13,15 @@ func TestXrefState(t *testing.T) { expectedError error expectTokens []tokenInterface }{ - {"No end of line", "x", xrefState{}, ErrEndOfLineExpected, []tokenInterface{}}, - {"end of line and offset", "\r\n1234\r\n", endOfLineState{}, nil, []tokenInterface{&endOfLineToken{}, &xrefToken{}, &endOfLineToken{}}}, + {"No end of line", "x", readXrefOffsetState{}, ErrEndOfLineExpected, []tokenInterface{}}, + {"end of line and offset", "\r\n1234\r\n", endOfLineState{}, nil, []tokenInterface{&endOfLineToken{}, &xrefOffsetToken{}, &endOfLineToken{}}}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { tok := createTestTokenizerWithData(tt.input) - tok.state = xrefState{ - token: &xrefToken{}, + tok.state = readXrefOffsetState{ + token: &xrefOffsetToken{}, } // Verarbeite den Tokenizer diff --git a/stateReadXref_test.go b/stateReadXref_test.go new file mode 100644 index 0000000000000000000000000000000000000000..aca340bb606b16a7af3dd5d8a5ac27852e7fef7c --- /dev/null +++ b/stateReadXref_test.go @@ -0,0 +1 @@ +package pdf diff --git a/stateRunBody.go b/stateRunBody.go index 012f2dcae3e14c4cfe48f0ed90acf209dbec33d0..db0dd7db6e6a20d22b39da080feb757dbeedecad 100644 --- a/stateRunBody.go +++ b/stateRunBody.go @@ -12,7 +12,7 @@ func (s runBodyState) process(t *Tokenizer) { for { t.switchState(newlineState{}) - lastToken := t.getLastToken() + lastToken := *t.peekToken() penultimateToken := t.penultimateToken() if len(t.parseError) == 1 && errors.Is(t.parseError[0], io.EOF) { @@ -30,6 +30,10 @@ func (s runBodyState) process(t *Tokenizer) { } } + if len(t.parseError) > 0 { + return + } + for _, err := range t.parseError { if errors.Is(err, io.EOF) { return diff --git a/stateRunBody_test.go b/stateRunBody_test.go index 6c950024b4d6f046dd0fbde62e8f2cc822891943..d6e5c2eec6aae071b0a557ce604a49351f5cf747 100644 --- a/stateRunBody_test.go +++ b/stateRunBody_test.go @@ -1,7 +1,6 @@ package pdf import ( - "io" "testing" ) @@ -14,8 +13,11 @@ func TestRunBodyState(t *testing.T) { expectedError error expectTokens []tokenInterface }{ - {"Some Text with carriage return", "Some Text\r", endOfLineState{}, io.EOF, []tokenInterface{}}, - {"End of File Token", "%%EOF\r\n", endOfLineState{}, nil, []tokenInterface{&endOfFileToken{}, &endOfLineToken{}}}, + {"Some Text with carriage return", "Some Text\r", + endOfLineState{}, ErrInvalidIndirectObjectIdentifier, + []tokenInterface{ + &endOfLineToken{}, + }}, } for _, tt := range tests { @@ -23,7 +25,6 @@ func TestRunBodyState(t *testing.T) { tok := createTestTokenizerWithData(tt.input) tok.state = runBodyState{} - // Verarbeite den Tokenizer tok.state.process(tok) checkStateAgainstDef(t, tok, tt.expectedState) diff --git a/stateStartArrayObject.go b/stateStartArrayObject.go index c48f7b8b8f3e93b06be1e9b68f047650c214d143..6f25ac1d9f533f148915f4e88c9802215daf6c5c 100644 --- a/stateStartArrayObject.go +++ b/stateStartArrayObject.go @@ -1,10 +1,5 @@ package pdf -// -//type arrayToken struct { -// tokenInternal -//} - type arrayStartToken struct { tokenInternal } @@ -30,16 +25,21 @@ func (s startArrayObjectState) process(t *Tokenizer) { t.emitToken(s.token) - //s.token = &arrayToken{} - // for { - c, err = t.peekNextRune() + c, err = t.consumeNextRune() if err != nil { t.raiseError(err) return } if isRightSquareBracket(c) { + + err = t.unread() + if err != nil { + t.raiseError(err) + return + } + t.switchState(endArrayObjectState{ token: &endArrayObjectToken{}, }) @@ -48,14 +48,18 @@ func (s startArrayObjectState) process(t *Tokenizer) { // Handle whitespace and delimiters if isWhitespace(c) { - c, err = t.consumeNextRune() // consume the ']' that ends the array - if err != nil { - t.raiseError(err) - return - } + t.switchState(whitespaceState{ + token: &whitespaceToken{}, + }) continue } + err = t.unread() + if err != nil { + t.raiseError(err) + return + } + t.switchState(startValueState{}) } diff --git a/stateStartArrayObject_test.go b/stateStartArrayObject_test.go index c10956fb455f6d78ebb9c99799d111bc1eb66c3f..54bad4a4a5414ca1e9a1482ec23d056ef8b4fca1 100644 --- a/stateStartArrayObject_test.go +++ b/stateStartArrayObject_test.go @@ -15,7 +15,19 @@ func TestStartArrayObjectStateProcess(t *testing.T) { expectTokens []tokenInterface }{ {"empty", "", startArrayObjectState{}, io.EOF, []tokenInterface{}}, - {"example 1 from 7.3.6", "[549 3.14 false (Ralph) /SomeName]", startArrayObjectState{}, io.EOF, []tokenInterface{}}, + {"example 1 from 7.3.6", "[549 3.14 false (Ralph) /SomeName]", endArrayObjectState{}, nil, []tokenInterface{ + &arrayStartToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &booleanToken{}, + &whitespaceToken{}, + &stringToken{}, + &whitespaceToken{}, + &nameToken{}, + &endArrayObjectToken{}, + }}, } for _, tt := range tests { diff --git a/stateStartDictionary.go b/stateStartDictionary.go deleted file mode 100644 index bf57a3a0848e060937a1d5e1ca8eb083b1d8fa83..0000000000000000000000000000000000000000 --- a/stateStartDictionary.go +++ /dev/null @@ -1,35 +0,0 @@ -package pdf - -type startDictionaryState struct { - token *dictionaryToken -} - -func (s startDictionaryState) process(t *Tokenizer) { - - c, err := t.consumeNextRune() - - if err != nil { - t.raiseError(err) - return - } - - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - - if isEndOfLine(c) { - t.raiseError(ErrExpectedDictionary) - return - } - - if !isLessThanSign(c) { - t.raiseError(ErrExpectedDictionary) - return - } - - t.switchState(dictionaryState{ - token: s.token, - }) - -} diff --git a/stateStartDictionaryObject.go b/stateStartDictionaryObject.go new file mode 100644 index 0000000000000000000000000000000000000000..1106bf21e68111b2f6ff7007e493ff97ed87c6d3 --- /dev/null +++ b/stateStartDictionaryObject.go @@ -0,0 +1,73 @@ +package pdf + +type startDictionaryObjectToken struct { + tokenInternal +} + +type startDictionaryObjectState struct { + token *startDictionaryObjectToken +} + +func (s startDictionaryObjectState) process(t *Tokenizer) { + + var r []rune + var err error + var c rune + + r, err = t.consumeNextRunes(2) + if err != nil { + t.raiseError(err) + return + } + if len(r) != 2 || !isLessThanSign(r[0]) || !isLessThanSign(r[1]) { + t.raiseError(errDictionaryObjectMustStartWithDoubleLessThanSign) + return + + } + + t.emitToken(s.token) + + for { + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + if isGreaterThanSign(c) { + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + err := t.unreadCount(2) + if err != nil { + t.raiseError(err) + return + } + + if isGreaterThanSign(c) { + t.switchState(endDictionaryObjectState{ + token: &endDictionaryObjectToken{}, + }) + return + } + + } + + if isWhitespace(c) { + t.switchState(whitespaceState{ + token: &whitespaceToken{}, + }) + continue + } + + err = t.unread() + if err != nil { + t.raiseError(err) + return + } + t.switchState(startValueState{}) + } +} diff --git a/stateStartDictionaryObject_test.go b/stateStartDictionaryObject_test.go new file mode 100644 index 0000000000000000000000000000000000000000..e976157b58b5a2fb6b45893ba2303fdab3f99b32 --- /dev/null +++ b/stateStartDictionaryObject_test.go @@ -0,0 +1,82 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestStartDictionaryState_Process(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + }{ + {"No end of line", "x", startDictionaryObjectState{}, io.EOF, []tokenInterface{}}, + {"Example from Documentation 7.3.7", "<</Type /Example\n/Subtype /DictionaryExample\n/Version 0.01\n/IntegerItem 12\n/StringItem (a string)\n/Subdictionary <<\n/Item1 0.4\n/Item2 true\n/LastItem (not !)\n/VeryLastItem (OK)\n>>\n>>", endDictionaryObjectState{}, nil, []tokenInterface{ + &startDictionaryObjectToken{}, + &nameToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &stringToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &startDictionaryObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &booleanToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &stringToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &stringToken{}, + &whitespaceToken{}, + &endDictionaryObjectToken{}, + &whitespaceToken{}, + &endDictionaryObjectToken{}, + }}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = startDictionaryObjectState{ + token: &startDictionaryObjectToken{}, + } + + // Verarbeite den Tokenizer + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + }) + } +} diff --git a/stateStartStartxref.go b/stateStartStartxref.go new file mode 100644 index 0000000000000000000000000000000000000000..470da57b4211d1e00977bbb0a7ac71351069e91e --- /dev/null +++ b/stateStartStartxref.go @@ -0,0 +1,50 @@ +package pdf + +type startStartxrefToken struct { + tokenInternal +} + +type startStartXrefState struct { + token *startStartxrefToken +} + +func (s startStartXrefState) process(t *Tokenizer) { + + var c rune + var err error + var value string + + for { + + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + if c != 's' && c != 'a' && c != 'r' && c != 't' && c != 'x' && c != 'e' && c != 'f' { + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + break + } + + value += string(c) + } + + if value != keywordStartXRef { + t.raiseError(ErrInvalidStartXref) + return + } + + t.emitToken(s.token) + + // Switch to the state that reads the xref offset + t.switchState(readXrefOffsetState{ + token: &xrefOffsetToken{}, + }) + + return +} diff --git a/stateStartStartxref_test.go b/stateStartStartxref_test.go new file mode 100644 index 0000000000000000000000000000000000000000..e8fbe474728f2137ad1d90fe7238bb3316d7a215 --- /dev/null +++ b/stateStartStartxref_test.go @@ -0,0 +1,44 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestStartStartXrefState(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + }{ + {"empty", "", startStartXrefState{}, io.EOF, []tokenInterface{}}, + {"startxref 1", "startxref", startStartXrefState{}, io.EOF, []tokenInterface{}}, + {"startxref 2", "startxref\n123456\n ", endOfLineState{}, nil, []tokenInterface{ + &startStartxrefToken{}, + &endOfLineToken{}, + &xrefOffsetToken{}, + &endOfLineToken{}, + }}, + {"startxref 3", "st", startStartXrefState{}, io.EOF, []tokenInterface{}}, + {"startxref 4", "st ", startStartXrefState{}, ErrInvalidStartXref, []tokenInterface{}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = startStartXrefState{ + token: &startStartxrefToken{}, + } + + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + }) + } +} diff --git a/stateStartStreamObject.go b/stateStartStreamObject.go new file mode 100644 index 0000000000000000000000000000000000000000..376893c507b0f521ffa638c98d1a7e9dae023fa4 --- /dev/null +++ b/stateStartStreamObject.go @@ -0,0 +1,48 @@ +package pdf + +type startStreamObjectToken struct { + tokenInternal +} + +type startStreamObjectState struct { + token *startStreamObjectToken +} + +func (s startStreamObjectState) process(t *Tokenizer) { + var c rune + var err error + var value string + + for { + + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + if c != 's' && c != 't' && c != 'r' && c != 'e' && c != 'a' && c != 'm' { + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + break + } + + value += string(c) + } + + if value != keywordStartStream { + t.raiseError(ErrInvalidBoolean) + return + } + + t.emitToken(s.token) + + t.switchState(readStreamObjectState{ + token: &streamObjectToken{}, + }) + + return +} diff --git a/stateStartStreamObject_test.go b/stateStartStreamObject_test.go new file mode 100644 index 0000000000000000000000000000000000000000..973dc9058e67e796c883b5d7aede06b5b2ce1b72 --- /dev/null +++ b/stateStartStreamObject_test.go @@ -0,0 +1,40 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestStartStreamObjectStat_Process(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + }{ + {"empty", "", startStreamObjectState{}, io.EOF, []tokenInterface{}}, + {"example from documentation", "stream\nBT\n/F1 12 Tf\n72 712 Td\n(A stream with an indirect length) Tj\nET\nendstream", + endStreamObjectState{}, nil, []tokenInterface{&startStreamObjectToken{}, &streamObjectToken{}, &endStreamObjectToken{}}}, + {"example from simple-pdf 2.0", "stream\n<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='Insert XMP tool name here.'>\n <rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>\n <rdf:Description rdf:about=\"\" xmlns:pdf=\"http://ns.adobe.com/pdf/1.3/\">\n <pdf:Producer>Datalogics - example producer program name here</pdf:Producer>\n <pdf:Copyright>Copyright 2017 PDF Association</pdf:Copyright>\n <pdf:Keywords>PDF 2.0 sample example</pdf:Keywords>\n </rdf:Description>\n <rdf:Description rdf:about=\"\" xmlns:xap=\"http://ns.adobe.com/xap/1.0/\">\n <xap:CreateDate>2017-05-24T10:30:11Z</xap:CreateDate>\n <xap:MetadataDate>2017-07-11T07:55:11Z</xap:MetadataDate>\n <xap:ModifyDate>2017-07-11T07:55:11Z</xap:ModifyDate>\n <xap:CreatorTool>Datalogics - example creator tool name here</xap:CreatorTool>\n </rdf:Description>\n <rdf:Description rdf:about=\"\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n <dc:format>application/pdf</dc:format>\n <dc:title>\n <rdf:Alt>\n <rdf:li xml:lang=\"x-default\">A simple PDF 2.0 example file</rdf:li>\n </rdf:Alt>\n </dc:title>\n <dc:creator>\n <rdf:Seq>\n <rdf:li>Datalogics Incorporated</rdf:li>\n </rdf:Seq>\n </dc:creator>\n <dc:description>\n <rdf:Alt>\n <rdf:li xml:lang=\"x-default\">Demonstration of a simple PDF 2.0 file.</rdf:li>\n </rdf:Alt>\n </dc:description>\n <dc:rights>\n <rdf:Alt>\n <rdf:li xml:lang=\"x-default\">Copyright 2017 PDF Association. Licensed to the public under Creative Commons Attribution-ShareAlike 4.0 International license.</rdf:li>\n </rdf:Alt>\n </dc:rights>\n </rdf:Description>\n <rdf:Description rdf:about=\"\" xmlns:xapRights=\"http://ns.adobe.com/xap/1.0/rights/\">\n <xapRights:Marked>True</xapRights:Marked>\n </rdf:Description>\n <rdf:Description rdf:about=\"\" xmlns:cc=\"http://creativecommons.org/ns#\">\n <cc:license rdf:resource=\"https://creativecommons.org/licenses/sa/4.0/\" />\n </rdf:Description>\n <rdf:Description rdf:about=\"\" xmlns:xapMM=\"http://ns.adobe.com/xap/1.0/mm/\">\n <xapMM:DocumentID>uuid:3eef2166-8332-abb4-3d31-77334578873f</xapMM:DocumentID>\n <xapMM:InstanceID>uuid:991bcce7-ee70-11a3-91aa-77bbe2181fd8</xapMM:InstanceID>\n </rdf:Description>\n </rdf:RDF>\n</x:xmpmeta>\nendstream\n", + endStreamObjectState{}, nil, []tokenInterface{&startStreamObjectToken{}, &streamObjectToken{}, &endStreamObjectToken{}}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = startStreamObjectState{ + token: &startStreamObjectToken{}, + } + + // Verarbeite den Tokenizer + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + }) + } +} diff --git a/stateStartValue.go b/stateStartValue.go index 1bfb78d296fc06a534d88d9e3781db8686131970..745447a3f639de46f2b6a5bd59ba1b1bb938595b 100644 --- a/stateStartValue.go +++ b/stateStartValue.go @@ -1,9 +1,5 @@ package pdf -import ( - "unicode" -) - type startValueState struct { } @@ -11,7 +7,7 @@ func (s startValueState) process(t *Tokenizer) { for { - c, err := t.peekNextRune() + c, err := t.consumeNextRune() if err != nil { t.raiseError(err) return @@ -19,57 +15,151 @@ func (s startValueState) process(t *Tokenizer) { // Bestimme den Typ basierend auf der ersten Nicht-Whitespace-Rune switch { - case unicode.IsDigit(c) || c == '+' || c == '-': + + case isPercentSign(c): + + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + t.switchState(commentState{ + token: &commentToken{}, + }) + return + + case isNumeric(c) || c == '+' || c == '-': + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + t.switchState(readNumberObjectState{ token: &numberToken{}, }) + return case isLeftParenthesis(c): + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + t.switchState(readStringObjectState{ token: &stringToken{}, }) + return case isLessThanSign(c): + + x, err := t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + err = t.unreadCount(2) + if err != nil { + t.raiseError(err) + return + } + + if isLessThanSign(x) { + + t.switchState(startDictionaryObjectState{ + token: &startDictionaryObjectToken{}, + }) + + return + } + t.switchState(readStringObjectState{ token: &stringToken{}, }) + + return case isSolidus(c): + + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + t.switchState(readNameObjectState{ token: &nameToken{}, }) + return case isLeftSquareBracket(c): + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + t.switchState(startArrayObjectState{ token: &arrayStartToken{}, }) + return case c == 't' || c == 'f': + + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + t.switchState(readBooleanObjectState{ token: &booleanToken{}, }) + return case c == 'n': + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + t.switchState(readNullObjectState{ token: &nullToken{}, }) + return - case isLeftSquareBracket(c): - t.switchState(startArrayObjectState{ - token: &arrayStartToken{}, + case c == 'R': + err := t.unread() + + if err != nil { + t.raiseError(err) + return + } + + t.switchState(readReferenceObjectState{ + token: &referenceObjectToken{}, }) - // @todo: implement dictionary object - //case isLeftCurlyBracket(c): - // t.switchState(startDictionaryObjectState{ - // token: &dictionaryStartToken{}, - // }) - //case c == 'R': - // t.switchState(readIndirectObjectState{ - // token: &indirectObjectToken{}, - // }) + return + case isWhitespace(c): // array or dictionary may contain whitespace return + case c == 's': + + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + t.switchState(startStreamObjectState{ + token: &startStreamObjectToken{}, + }) + default: t.raiseError(ErrUnknownValueType) return diff --git a/stateStartValue_test.go b/stateStartValue_test.go index 85ef5cca4ed87314779503ccf8ad9d59f00e32ac..eaa7a919f3b0c9b5045bb5c420d7158467e0435f 100644 --- a/stateStartValue_test.go +++ b/stateStartValue_test.go @@ -15,6 +15,38 @@ func TestStartValueState_Process(t *testing.T) { expectTokens []tokenInterface }{ {"empty", "", startValueState{}, io.EOF, []tokenInterface{}}, + {"a name", "/Name ", readNameObjectState{}, nil, []tokenInterface{&nameToken{}}}, + {"a number", "123 ", readNumberObjectState{}, nil, []tokenInterface{&numberToken{}}}, + {"a string", "(String)", readStringObjectState{}, nil, []tokenInterface{&stringToken{}}}, + {"a hex string", "<4E6F6D65>", readStringObjectState{}, nil, []tokenInterface{&stringToken{}}}, + {"a dictionary", "<<\n/Name (Value)\n>>", endDictionaryObjectState{}, nil, []tokenInterface{ + &startDictionaryObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &stringToken{}, + &whitespaceToken{}, + &endDictionaryObjectToken{}}, + }, + {"a boolean", "true ", readBooleanObjectState{}, nil, []tokenInterface{&booleanToken{}}}, + {"a boolean false", "false ", readBooleanObjectState{}, nil, []tokenInterface{&booleanToken{}}}, + {"an array", "[1 2 3] ", endArrayObjectState{}, nil, []tokenInterface{ + &arrayStartToken{}, + &numberToken{}, &whitespaceToken{}, + &numberToken{}, &whitespaceToken{}, + &numberToken{}, + &endArrayObjectToken{}}}, + {"an array with Reference", "[12 0 R] ", endArrayObjectState{}, nil, []tokenInterface{&arrayStartToken{}, + &numberToken{}, &whitespaceToken{}, + &numberToken{}, &whitespaceToken{}, + &referenceObjectToken{}, + &endArrayObjectToken{}}}, + {"a null", "null ", readNullObjectState{}, nil, []tokenInterface{&nullToken{}}}, + {"a reference", "R", readReferenceObjectState{}, nil, []tokenInterface{&referenceObjectToken{}}}, + {"a comment", "% Comment\n ", endOfLineState{}, nil, []tokenInterface{&commentToken{}, &endOfLineToken{}}}, + {"a comment with newline", "% Comment\n\n ", endOfLineState{}, nil, []tokenInterface{&commentToken{}, &endOfLineToken{}}}, + {"a comment with CR", "% Comment\r ", endOfLineState{}, nil, []tokenInterface{&commentToken{}, &endOfLineToken{}}}, + {"a comment with CR LF", "% Comment\r\n ", endOfLineState{}, nil, []tokenInterface{&commentToken{}, &endOfLineToken{}}}, } for _, tt := range tests { diff --git a/stateStartXref.go b/stateStartXref.go new file mode 100644 index 0000000000000000000000000000000000000000..aa83014794511da433b65ab80260b4e9e2bb7861 --- /dev/null +++ b/stateStartXref.go @@ -0,0 +1,49 @@ +package pdf + +type startXrefToken struct { + tokenInternal +} + +type startXrefState struct { + token *startXrefToken +} + +func (s startXrefState) process(t *Tokenizer) { + + var c rune + var err error + var value string + + for { + + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + if c != 'x' && c != 'r' && c != 'e' && c != 'f' { + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + break + } + + value += string(c) + } + + if value != keywordXRef { + t.raiseError(ErrInvalidXref) + return + } + + t.emitToken(s.token) + + t.switchState(readXrefState{ + token: &xrefToken{}, + }) + + return +} diff --git a/stateStartXref_test.go b/stateStartXref_test.go new file mode 100644 index 0000000000000000000000000000000000000000..03bdf6421fbbb2c701e4676f747d182f2b343df4 --- /dev/null +++ b/stateStartXref_test.go @@ -0,0 +1,108 @@ +package pdf + +import ( + "io" + "testing" +) + +func TestStartXrefState(t *testing.T) { + + tests := []struct { + name string + input string + expectedState state + expectedError error + expectTokens []tokenInterface + }{ + {"empty", "", startXrefState{}, io.EOF, []tokenInterface{}}, + + {"xref from the simple sample file", "xref\n0 10\n0000000000 65535 f\n0000000016 00000 n\n0000000096 00000 n\n0000002547 00000 n\n0000002619 00000 n\n0000002782 00000 n\n0000003587 00000 n\n0000003811 00000 n\n0000003972 00000 n\n0000004524 00000 n\ntrailer", + endOfLineState{}, nil, []tokenInterface{ + + &startXrefToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &xrefStateToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &xrefStateToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &xrefStateToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &xrefStateToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &xrefStateToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &xrefStateToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &xrefStateToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &xrefStateToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &xrefStateToken{}, + &endOfLineToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &xrefStateToken{}, + &endOfLineToken{}, + &xrefToken{}, + }}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + tok.state = startXrefState{ + token: &startXrefToken{}, + } + + // Verarbeite den Tokenizer + tok.state.process(tok) + + checkStateAgainstDef(t, tok, tt.expectedState) + checkErrors(t, tok, tt.expectedError) + checkTokenStack(t, tok, tt.expectTokens) + + }) + } +} diff --git a/stateTrailer.go b/stateTrailer.go index 272953474a37455154a4f4ec8d393e18ed062b87..eb1547033a2931989decdc76e0ba16449b9fc69c 100644 --- a/stateTrailer.go +++ b/stateTrailer.go @@ -10,21 +10,13 @@ type trailerStartState struct { func (s trailerStartState) process(t *Tokenizer) { - c, err := t.consumeNextRune() + _, err := t.consumeNextRune() if err != nil { t.raiseError(err) return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - - if !isEndOfLine(c) { - t.raiseError(ErrExpectedEndOfLine) - return - } + t.emitToken(s.token) for { @@ -34,20 +26,71 @@ func (s trailerStartState) process(t *Tokenizer) { return } - if isEndOfFile(c) { - t.raiseError(ErrUnexpectedEOF) - return - } - if isEndOfLine(c) { continue } switch { case isLessThanSign(c): - t.switchState(startDictionaryState{ - token: &dictionaryToken{}, + + err := t.unread() + if err != nil { + t.raiseError(err) + return + } + + t.switchState(startDictionaryObjectState{ + token: &startDictionaryObjectToken{}, }) + + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + err = t.unread() + if err != nil { + t.raiseError(err) + return + } + + if isWhitespace(c) { + + t.switchState(whitespaceState{ + token: &whitespaceToken{}, + }) + } + + t.switchState(startStartXrefState{ + token: &startStartxrefToken{}, + }) + + c, err = t.consumeNextRune() + if err != nil { + t.raiseError(err) + return + } + + err = t.unread() + if err != nil { + t.raiseError(err) + return + } + + if isWhitespace(c) { + + t.switchState(whitespaceState{ + token: &whitespaceToken{}, + }) + } + + t.switchState(endOfFileState{ + token: &endOfFileToken{}, + }) + + return + } //if startDict == keywordStartDictionary { diff --git a/stateTrailer_test.go b/stateTrailer_test.go index 0770eedfa5b2e9233117d643c3e20b4695ea16a4..544f45fcc04f1d0420ebb3957ba2d94f8af14c7d 100644 --- a/stateTrailer_test.go +++ b/stateTrailer_test.go @@ -14,8 +14,44 @@ func TestTrailerStartState(t *testing.T) { expectedError error expectTokens []tokenInterface }{ - {"Only whitespaces", " ", whitespaceState{}, io.EOF, []tokenInterface{&whitespaceToken{}}}, - {"Simple Trailer example", "trailer\n<<\n/Size 22\n/Root 1 0 R\n/Info 2 0 R\n>>\nstartxref\n123456\n%%EOF\n", whitespaceState{}, io.EOF, []tokenInterface{&whitespaceToken{}}}, + {"Only whitespaces", " ", trailerStartState{}, io.EOF, []tokenInterface{&trailerToken{}}}, + {"Simple Trailer example", "trailer\n<<\n/Size 22\n/Root 1 0 R\n/Info 2 0 R\n>>\nstartxref\n123456\n%%EOF\n", + endOfLineState{}, nil, + []tokenInterface{ + + &trailerToken{}, + &startDictionaryObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &referenceObjectToken{}, + &whitespaceToken{}, + &nameToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &numberToken{}, + &whitespaceToken{}, + &referenceObjectToken{}, + &whitespaceToken{}, + &endDictionaryObjectToken{}, + &whitespaceToken{}, + &endOfLineToken{}, + &startStartxrefToken{}, + &endOfLineToken{}, + &xrefOffsetToken{}, + &endOfLineToken{}, + &endOfFileToken{}, + &endOfLineToken{}, + }}, } for _, tt := range tests { diff --git a/stateWhitespace.go b/stateWhitespace.go index 1b1fed488ab0f9fe4a3bbc6bd0ab7711a5bd9597..1e359948b50896bc40c156b1eba8c3546ffb8718 100644 --- a/stateWhitespace.go +++ b/stateWhitespace.go @@ -18,10 +18,6 @@ func (s whitespaceState) process(t *Tokenizer) { break } - if isEndOfFile(c) { - break - } - if isEndOfLine(c) { t.emitToken(s.token) t.switchState(endOfLineState{ diff --git a/tokenizer.go b/tokenizer.go index c6a6f514d1c02f7ebf98ea281ec40d83b17e89e5..16769580fd9dc20faeafed286308cf7c0d8bb16d 100644 --- a/tokenizer.go +++ b/tokenizer.go @@ -2,6 +2,7 @@ package pdf import ( "bufio" + "errors" "fmt" "io" "reflect" @@ -13,28 +14,12 @@ func NewTokenizer(r io.Reader) *Tokenizer { reader: bufio.NewReader(r), tokenStack: make([]tokenInterface, 0), parseError: make([]error, 0), - //state: initialState{}, - //tmpBuf: "", - tmpBuf: []rune{}, - lastRune: []rune{}, - mu: sync.Mutex{}, + tmpBuf: []rune{}, + lastRune: []rune{}, + mu: sync.Mutex{}, } } -//type StateType int - -//const ( -// InitialStateType StateType = iota + 1 -// CommentStateType -// WhitespaceStateType -// RegularStateType -// NumberStateType -// NameStateType -// LiteralStringStateType -// HexStringStateType -// KeywordStateType -//) - type Tokenizer struct { reader *bufio.Reader tokenStack []tokenInterface @@ -51,23 +36,20 @@ type Tokenizer struct { mu sync.Mutex } -//func (t *Tokenizer) NextToken() (*tokenInterface, error) { -// line, err := t.reader.ReadString('\n') -// if err != nil { -// return nil, err -// } -// line = strings.TrimSpace(line) -// if strings.HasPrefix(line, "%") { -// return &tokenInterface{Type: Comment, Value: line}, nil -// } -// // Füge hier weitere Regeln hinzu, um andere Typen von tokenInterface zu identifizieren -// return &tokenInterface{Type: Unknown, Value: line}, nil -//} - func (t *Tokenizer) raiseError(err error) { t.mu.Lock() defer t.mu.Unlock() + if !errors.Is(err, ErrInvalidIndirectEndObjectKeyword) { + fmt.Println("Error:", err) + } + + for _, e := range t.parseError { + if errors.Is(e, err) { + return + } + } + t.parseError = append(t.parseError, err) } @@ -78,44 +60,40 @@ func (t *Tokenizer) Errors() []error { return t.parseError } -func (t *Tokenizer) HasError() bool { +func (t *Tokenizer) isLastError(err error) bool { t.mu.Lock() defer t.mu.Unlock() - return len(t.parseError) > 0 -} + if len(t.parseError) == 0 { + return false + } -//func (t *Tokenizer) GetTokenStack() []tokenInterface { -// return t.tokenStack -//} + return errors.Is(t.parseError[len(t.parseError)-1], err) +} -func (t *Tokenizer) PushToken(o tokenInterface) { +func (t *Tokenizer) removeLastError() { t.mu.Lock() defer t.mu.Unlock() - t.tokenStack = append(t.tokenStack, o) + if len(t.parseError) == 0 { + return + } + + t.parseError = t.parseError[:len(t.parseError)-1] } -func (t *Tokenizer) popToken() *tokenInterface { +func (t *Tokenizer) hasError() bool { t.mu.Lock() defer t.mu.Unlock() - if len(t.tokenStack) == 0 { - return nil - } - token := t.tokenStack[len(t.tokenStack)-1] - t.tokenStack = t.tokenStack[:len(t.tokenStack)-1] - return &token + return len(t.parseError) > 0 } -func (t *Tokenizer) peekToken() *tokenInterface { +func (t *Tokenizer) pushToken(o tokenInterface) { t.mu.Lock() defer t.mu.Unlock() - if len(t.tokenStack) == 0 { - return nil - } - return &t.tokenStack[len(t.tokenStack)-1] + t.tokenStack = append(t.tokenStack, o) } func (t *Tokenizer) emitToken(o tokenInterface) { @@ -131,35 +109,9 @@ func (t *Tokenizer) emitToken(o tokenInterface) { o.emit() o.setByteOffset(int64(t.byteOffset)) - t.PushToken(o) + t.pushToken(o) } -//func (t *Tokenizer) tokenCount() int { -// return len(t.tokenStack) -//} - -//func (t *Tokenizer) resetTempBuffer() { -// t.tmpBuf = []rune{} -//} -// -//func (t *Tokenizer) appendToTempBuffer(c rune) { -// t.tmpBuf = append(t.tmpBuf, c) -// t.byteOffset += len([]byte(string(c))) -//} - -//func (t *Tokenizer) setTempBuffer(r []rune) { -// t.tmpBuf = r -// t.byteOffset -= len(r) -//} - -//func (t *Tokenizer) tempBuffer() string { -// return string(t.tmpBuf) -//} - -//func (t *Tokenizer) getTempBuffer() []rune { -// return t.tmpBuf -//} - func (t *Tokenizer) peekNextRune() (rune, error) { r, err := t.consumeNextRune() if err != nil { @@ -188,6 +140,18 @@ func (t *Tokenizer) peekNextRunes(count int) ([]rune, error) { return runes, nil } +func (t *Tokenizer) consumeNextRunes(count int) ([]rune, error) { + runes := make([]rune, 0) + for i := 0; i < count; i++ { + r, err := t.consumeNextRune() + if err != nil { + return nil, err + } + runes = append(runes, r) + } + return runes, nil +} + func (t *Tokenizer) consumeNextRune() (rune, error) { t.mu.Lock() defer t.mu.Unlock() @@ -199,8 +163,6 @@ func (t *Tokenizer) consumeNextRune() (rune, error) { t.byteOffset += len([]byte(string(c))) t.lastRune = append(t.lastRune, c) - fmt.Println("consumeNextRune (tmp): ", c, string(c), t.byteOffset) - return c, nil } @@ -213,8 +175,6 @@ func (t *Tokenizer) consumeNextRune() (rune, error) { t.lastRune = append(t.lastRune, c) t.lastRuneSize = s - fmt.Println("consumeNextRune: ", c, string(c), t.byteOffset) - return c, nil } @@ -234,8 +194,6 @@ func (t *Tokenizer) unread() error { t.tmpBuf = append([]rune{u}, t.tmpBuf...) - fmt.Println("unread: ", u, string(u), t.byteOffset) - return nil } @@ -254,44 +212,44 @@ func (t *Tokenizer) unreadCount(count int) error { t.byteOffset -= len([]byte(string(u))) } - fmt.Println("unreadCount: ", count, t.byteOffset) - return nil } -//func (t *Tokenizer) reconsumeCharacter(count int) { -// for i := 0; i < count; i++ { -// _ = t.reader.UnreadRune() -// } -//} - func (t *Tokenizer) switchState(state state) { t.mu.Lock() t.state = state t.mu.Unlock() - state.process(t) } -// func (t *Tokenizer) currentState() state { -// return t.state -// } -func (t *Tokenizer) getLastToken() tokenInterface { +func (t *Tokenizer) penultimateToken() tokenInterface { + t.mu.Lock() + defer t.mu.Unlock() + + if len(t.tokenStack) < 2 { + return nil + } + return t.tokenStack[len(t.tokenStack)-2] +} + +func (t *Tokenizer) popToken() *tokenInterface { t.mu.Lock() defer t.mu.Unlock() if len(t.tokenStack) == 0 { return nil } - return t.tokenStack[len(t.tokenStack)-1] + token := t.tokenStack[len(t.tokenStack)-1] + t.tokenStack = t.tokenStack[:len(t.tokenStack)-1] + return &token } -func (t *Tokenizer) penultimateToken() tokenInterface { +func (t *Tokenizer) peekToken() *tokenInterface { t.mu.Lock() defer t.mu.Unlock() - if len(t.tokenStack) < 2 { + if len(t.tokenStack) == 0 { return nil } - return t.tokenStack[len(t.tokenStack)-2] + return &t.tokenStack[len(t.tokenStack)-1] } diff --git a/tokenizer_test.go b/tokenizer_test.go index e3fa023c558ede42726c89bcea24e409e2719ae6..a0319265d90376963b80172aef10ded3b83b6f53 100644 --- a/tokenizer_test.go +++ b/tokenizer_test.go @@ -43,12 +43,12 @@ func TestTokenizerRaiseError(t *testing.T) { tok := createTestTokenizer(t) - if tok.HasError() == true { + if tok.hasError() == true { t.Errorf("Exspected true, got false") } tok.raiseError(ErrTokenizerTest) - if tok.HasError() == false { + if tok.hasError() == false { t.Errorf("Exspected false, got true") } @@ -270,7 +270,7 @@ func TestTokenizerGetLastToken(t *testing.T) { tok := createTestTokenizerWithData("") // Teste für den Fall, dass der token-Stack leer ist - token := tok.getLastToken() + token := tok.peekToken() if token != nil { t.Errorf("expected nil when no tokens are available, got %v", token) } @@ -279,7 +279,7 @@ func TestTokenizerGetLastToken(t *testing.T) { mockToken := &mockToken{} tok.tokenStack = append(tok.tokenStack, mockToken) - lastToken := tok.getLastToken() + lastToken := *tok.peekToken() if lastToken != mockToken { t.Errorf("expected last token to be %v, got %v", mockToken, lastToken) } @@ -327,7 +327,42 @@ func TestPeekNextRunes(t *testing.T) { t.Run(tt.name, func(t *testing.T) { tok := createTestTokenizerWithData(tt.input) - r, e := tok.peekNextRunes(tt.count) + for i := 0; i < 2; i++ { + r, e := tok.peekNextRunes(tt.count) + if tt.expectedError != nil { + if !errors.Is(e, tt.expectedError) { + t.Errorf("expected error %v, got %v", tt.expectedError, e) + } + return + } + + if string(r) != tt.expected { + t.Errorf("expected %v, got %v", tt.expected, string(r)) + } + } + }) + } +} + +func TestConsumeNextRunes(t *testing.T) { + tests := []struct { + name string + input string + count int + expected string + expected2 string + expectedError error + expectedError2 error + }{ + {"Single character", "a", 1, "a", "", nil, io.EOF}, + {"Multiple characters", "abc", 2, "ab", "c", nil, nil}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tok := createTestTokenizerWithData(tt.input) + + r, e := tok.consumeNextRunes(tt.count) if tt.expectedError != nil { if !errors.Is(e, tt.expectedError) { t.Errorf("expected error %v, got %v", tt.expectedError, e) @@ -339,6 +374,32 @@ func TestPeekNextRunes(t *testing.T) { t.Errorf("expected %v, got %v", tt.expected, string(r)) } + r2, e2 := tok.consumeNextRune() + if tt.expectedError2 != nil { + if !errors.Is(e2, tt.expectedError2) { + t.Errorf("expected error %v, got %v", tt.expectedError, e2) + } + return + } + + if string(r2) != tt.expected2 { + t.Errorf("expected %v, got %v", tt.expected, string(r2)) + } }) } } + +func TestSimpleDocumentParser_Parse(t *testing.T) { + pdfDocument := "%PDF-1.4\n1 0 obj\n<<\n /Type /Catalog\n /Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n /Type /Pages\n /Kids [3 0 R]\n /Count 1\n>>\nendobj\n3 0 obj\n<<\n /Type /Page\n /Parent 2 0 R\n /MediaBox [0 0 300 144]\n /Contents 4 0 R\n>>\nendobj\n4 0 obj\n<<\n /Length 44\n>>\nstream\nBT /F1 18 Tf 0 0 Td (Hallo Welt) Tj ET\nendstream\nendobj\nstartxref\n1234\n%%EOF\n" + + tok := createTestTokenizerWithData(pdfDocument) + tok.switchState(initialState{}) + + if tok.hasError() == true { + t.Errorf("Exspected true, got false") + for _, e := range tok.Errors() { + t.Errorf("Expected no errors, got %v", e) + } + } + +}