Skip to content
Snippets Groups Projects
Verified Commit 61a52cba authored by Volker Schukai's avatar Volker Schukai :alien:
Browse files

feat: implement tokenizer

parent 4b5d06d4
No related branches found
No related tags found
No related merge requests found
Pipeline #30727 passed
Showing
with 286 additions and 172 deletions
......@@ -4,6 +4,7 @@ import (
"encoding/hex"
"fmt"
"strconv"
"strings"
"unicode"
)
......@@ -65,6 +66,8 @@ func octalToRune(octal []rune) rune {
}
func hexToString(hexStr string) (string, error) {
// Remove spaces from the hex string to ensure clean decoding
hexStr = strings.ReplaceAll(hexStr, " ", "")
if len(hexStr)%2 != 0 {
hexStr += "0"
......
......@@ -58,3 +58,17 @@ func TestIsLeftParenthesis(t *testing.T) {
}
}
}
// TestHexToString decodes a space-separated hex dump of UTF-8 bytes and
// verifies the resulting string matches the expected Unicode text.
func TestHexToString(t *testing.T) {
	input := "EF BB BF E8 A1 A8 E3 83 9D E3 81 82 41 E9 B7 97 C5 92 C3 A9 EF BC A2 E9 80 8D C3 9C C3 9F C2 AA C4 85 C3 B1 E4 B8 82 E3 90 80 F0 A0 80 80"
	want := "\ufeff表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀"

	got, err := hexToString(input)
	if err != nil {
		t.Errorf("Expected no error, got %v", err)
	}
	if got != want {
		t.Errorf("Expected %q, got %q", want, got)
	}
}
package pdf
// MagicNumber is the PDF header marker fragment ("PDF-") the tokenizer
// looks for at the start of a file.
const MagicNumber = "PDF-"

// EndOfFileMagicNumber is the end-of-file marker. Note the TWO percent
// signs: the full marker in a PDF file is "%%EOF".
const EndOfFileMagicNumber = "%%EOF"

// Keywords recognized by the tokenizer states.
const (
	keywordXRef         = "xref"
	keywordStartXRef    = "startxref"
	keywordEndObj       = "endobj"
	keywordStartStream  = "stream"
	keywordEndStream    = "endstream"
	keywordStartTrailer = "trailer"
)

// Retained for reference; the corresponding states currently match
// these literals directly rather than via a named constant.
// const keywordStartObj = "obj"
// const keywordStartDictionary = "<<"
// const keywordEndDictionary = ">>"
......@@ -21,4 +21,18 @@ var (
errArrayObjectMustStartWithBracket = errors.New("array object must start with a bracket")
errArrayObjectMustEndWithBracket = errors.New("array object must end with a bracket")
ErrInvalidNull = errors.New("invalid null")
errDictionaryObjectMustStartWithDoubleLessThanSign = errors.New("dictionary object must start with double less than sign")
errDictionaryObjectMustEndWithDoubleGreaterThanSign = errors.New("dictionary object must End with double greater than sign")
ErrInvalidEndStream = errors.New("invalid end stream")
ErrInvalidObjectLineFormat = errors.New("invalid object line format")
ErrUnexpectedEndOfLine = errors.New("unexpected end of line")
ErrInvalidIndirectObjectIdentifier = errors.New("invalid indirect object identifier")
ErrInvalidIndirectObjectKeyword = errors.New("invalid indirect object keyword")
ErrInvalidIndirectObjectGeneration = errors.New("invalid indirect object generation")
ErrInvalidIndirectEndObjectKeyword = errors.New("invalid indirect end object keyword")
ErrInvalidReferenceObject = errors.New("invalid reference object")
ErrInvalidStartXref = errors.New("invalid startxref")
ErrUnexpectedNullToken = errors.New("unexpected null token")
ErrInvalidXref = errors.New("invalid xref")
ErrInvalidXrefSyntax = errors.New("invalid xref syntax")
)
......@@ -14,19 +14,33 @@ func TestNewParser(t *testing.T) {
func TestParser_Parse(t *testing.T) {
file, err := os.Open("assets/Simple-PDF-2.0-file.pdf")
files := []string{
"pdf20-utf8-test.pdf",
"Simple-PDF-2.0-file.pdf",
"PDF-2.0-image-with-BPC.pdf",
"PDF-2.0-UTF-8-string-and-annotation.pdf",
"PDF-2.0-via-incremental-save.pdf",
"PDF-2.0-with-offset-start.pdf",
"PDF-2.0-with-page-level-output-intent.pdf",
}
for _, f := range files {
t.Run("TestParser_Parse "+f, func(t *testing.T) {
file, err := os.Open("assets/" + f)
if err != nil {
t.Errorf("Expected to open the file, got %v", err)
os.Exit(1)
return
}
p := NewParser(file)
p.Parse()
if p.HasError() {
if p.HasError() == true {
for _, e := range p.GetErrors() {
t.Errorf("Expected no errors, got %v", e)
}
}
})
}
}
......@@ -17,11 +17,6 @@ func (s beforeDictionaryValueState) process(t *Tokenizer) {
return
}
if isEndOfFile(c) {
t.raiseError(ErrUnexpectedEOF)
return
}
if isEndOfLine(c) {
continue
}
......
package pdf
// beforeKeyState skips blank space that may precede a dictionary key.
type beforeKeyState struct {
	token *dictionaryToken
}

// process discards whitespace and end-of-line runes until a significant
// rune appears. A solidus ('/') begins a dictionary key, so control
// moves to dictionaryKeyState; any other rune leaves the state machine
// unchanged. End-of-file here raises ErrUnexpectedEOF.
func (s beforeKeyState) process(t *Tokenizer) {
	for {
		r, err := t.consumeNextRune()
		if err != nil {
			t.raiseError(err)
			return
		}
		if isEndOfFile(r) {
			t.raiseError(ErrUnexpectedEOF)
			return
		}
		if isEndOfLine(r) || isWhitespace(r) {
			continue
		}
		if isSolidus(r) {
			t.switchState(dictionaryKeyState{token: s.token})
		}
		return
	}
}
......@@ -3,7 +3,7 @@ package pdf
import "strconv"
type byteOffsetOfLastCrossReferenceSectionState struct {
token *xrefToken
token *xrefOffsetToken
}
func (s byteOffsetOfLastCrossReferenceSectionState) process(t *Tokenizer) {
......@@ -21,11 +21,6 @@ func (s byteOffsetOfLastCrossReferenceSectionState) process(t *Tokenizer) {
return
}
if isEndOfFile(c) {
t.raiseError(ErrUnexpectedEOF)
return
}
if isEndOfLine(c) {
o, err := strconv.ParseInt(string(s.token.values), 10, 64)
......@@ -40,7 +35,7 @@ func (s byteOffsetOfLastCrossReferenceSectionState) process(t *Tokenizer) {
t.switchState(endOfLineState{
token: &endOfLineToken{},
})
break
return
}
if !isNumeric(c) {
......@@ -51,7 +46,4 @@ func (s byteOffsetOfLastCrossReferenceSectionState) process(t *Tokenizer) {
s.token.values = append(s.token.values, c)
}
return
}
......@@ -16,14 +16,14 @@ func TestByteOffsetLastCrossReferenceSectionState(t *testing.T) {
}{
{"No numeric value", "x", byteOffsetOfLastCrossReferenceSectionState{}, ErrInvalidByteOffset, []tokenInterface{}},
{"numeric value", "1234", byteOffsetOfLastCrossReferenceSectionState{}, io.EOF, []tokenInterface{}},
{"numeric value with linefeed", "1234\n", endOfLineState{}, io.EOF, []tokenInterface{&xrefToken{}}},
{"numeric value with linefeed", "1234\n", endOfLineState{}, io.EOF, []tokenInterface{&xrefOffsetToken{}}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tok := createTestTokenizerWithData(tt.input)
tok.state = byteOffsetOfLastCrossReferenceSectionState{
token: &xrefToken{},
token: &xrefOffsetToken{},
}
// Verarbeite den Tokenizer
......@@ -40,7 +40,7 @@ func TestByteOffsetLastCrossReferenceSectionState(t *testing.T) {
func TestByteOffsetLastCrossReferenceSectionStateValue(t *testing.T) {
tok := createTestTokenizerWithData("1\n")
tok.state = byteOffsetOfLastCrossReferenceSectionState{
token: &xrefToken{},
token: &xrefOffsetToken{},
}
// Verarbeite den Tokenizer
......@@ -48,10 +48,10 @@ func TestByteOffsetLastCrossReferenceSectionStateValue(t *testing.T) {
checkStateAgainstDef(t, tok, endOfLineState{})
checkErrors(t, tok, io.EOF)
checkTokenStack(t, tok, []tokenInterface{&xrefToken{}})
checkTokenStack(t, tok, []tokenInterface{&xrefOffsetToken{}})
if tok.getLastToken().(*xrefToken).byteOffsetXRef != 1 {
t.Errorf("Expected byte offset to be 1, got %d", tok.getLastToken().(*xrefToken).byteOffsetXRef)
if (*tok.peekToken()).(*xrefOffsetToken).byteOffsetXRef != 1 {
t.Errorf("Expected byte offset to be 1, got %d", (*tok.peekToken()).(*xrefOffsetToken).byteOffsetXRef)
}
}
......@@ -29,19 +29,19 @@ func (s commentState) process(t *Tokenizer) {
return
}
if isEndOfFile(c) {
if isEndOfLine(c) {
t.emitToken(s.token)
t.raiseError(ErrUnexpectedEOF)
err := t.unread()
if err != nil {
t.raiseError(err)
return
}
if isEndOfLine(c) {
t.emitToken(s.token)
t.switchState(endOfLineState{
token: &endOfLineToken{},
})
break
return
}
s.token.values = append(s.token.values, c)
......
package pdf
// dictionaryToken represents a PDF dictionary object (<< ... >>)
// produced by the tokenizer.
type dictionaryToken struct {
	tokenInternal
}

// dictionaryState is the tokenizer state entered for a dictionary
// object.
type dictionaryState struct {
	token *dictionaryToken
}

// process is currently a stub — it consumes nothing and emits nothing.
// NOTE(review): presumably dictionary handling lives in the key/value
// sub-states (beforeKeyState, dictionaryKeyState); confirm whether this
// state is still required.
func (s dictionaryState) process(t *Tokenizer) {
}
package pdf
// keywordToken marks a bare keyword encountered by the tokenizer.
type keywordToken struct {
	tokenInternal
}

// dictionaryKeyState consumes the characters of a dictionary key, i.e.
// the name that follows a solidus ('/').
type dictionaryKeyState struct {
	token *dictionaryToken
}

// process reads runes until the first whitespace ends the key, then
// switches to beforeDictionaryValueState. A raw end-of-line inside a
// key raises ErrExpectedDictionary; end-of-file raises ErrUnexpectedEOF.
//
// NOTE(review): the accumulated key text is discarded when the state
// switches — presumably it should be attached to a token; confirm.
func (s dictionaryKeyState) process(t *Tokenizer) {
	var key string
	for {
		r, err := t.consumeNextRune()
		if err != nil {
			t.raiseError(err)
			return
		}
		switch {
		case isEndOfFile(r):
			t.raiseError(ErrUnexpectedEOF)
			return
		case isEndOfLine(r):
			t.raiseError(ErrExpectedDictionary)
			return
		case isWhitespace(r):
			t.switchState(beforeDictionaryValueState{})
			return
		}
		key += string(r)
	}
}
package pdf
// endDictionaryObjectToken is emitted when a dictionary's closing ">>"
// has been read.
type endDictionaryObjectToken struct {
	tokenInternal
}

// endDictionaryObjectState expects exactly the two-rune sequence ">>".
type endDictionaryObjectState struct {
	token *endDictionaryObjectToken
}

// process consumes two runes and emits the end-of-dictionary token when
// both are '>'; anything else raises
// errDictionaryObjectMustEndWithDoubleGreaterThanSign.
func (s endDictionaryObjectState) process(t *Tokenizer) {
	runes, err := t.consumeNextRunes(2)
	if err != nil {
		t.raiseError(err)
		return
	}

	valid := len(runes) == 2 && isGreaterThanSign(runes[0]) && isGreaterThanSign(runes[1])
	if !valid {
		t.raiseError(errDictionaryObjectMustEndWithDoubleGreaterThanSign)
		return
	}

	t.emitToken(s.token)
}
......@@ -21,11 +21,6 @@ func (s endOfFileState) process(t *Tokenizer) {
return
}
if isEndOfFile(c) {
t.raiseError(ErrUnexpectedEOF)
return
}
if c == 'E' || c == 'O' || c == 'F' || isPercentSign(c) {
s.token.values = append(s.token.values, c)
......@@ -42,11 +37,19 @@ func (s endOfFileState) process(t *Tokenizer) {
}
if string(s.token.values) != EndOfFileMagicNumber {
err := t.unreadCount(len(s.token.values))
if s.token.values == nil {
return
}
l := len(s.token.values)
if l > 0 {
err := t.unreadCount(l)
if err != nil {
t.raiseError(err)
return
}
}
return
}
......
......@@ -16,7 +16,7 @@ func TestEndOfFileState(t *testing.T) {
}{
{"Empty File", "", endOfFileState{}, io.EOF, []tokenInterface{}},
{"End of file", "EOF", endOfFileState{}, io.EOF, []tokenInterface{}},
{"End of file with end of line", "%EOF\n", endOfLineState{}, nil, []tokenInterface{&endOfFileToken{}, &endOfLineToken{}}},
{"End of file with end of line", "%%EOF\n", endOfLineState{}, nil, []tokenInterface{&endOfFileToken{}, &endOfLineToken{}}},
}
for _, tt := range tests {
......@@ -44,7 +44,7 @@ func TestEndOfFileTokenIsNil(t *testing.T) {
// Verarbeite den Tokenizer
tok.state.process(tok)
if !tok.HasError() {
if !tok.hasError() {
t.Errorf("Expected error, got none")
}
}
package pdf
// endOfIndirectObjectToken is emitted once the "endobj" keyword that
// terminates an indirect object has been matched.
type endOfIndirectObjectToken struct {
	tokenInternal
}

// endOfIndirectObjectState matches the literal keyword "endobj".
type endOfIndirectObjectState struct {
	token *endOfIndirectObjectToken
}

// process collects runes drawn from the letters of "endobj"; the first
// rune outside that set is pushed back and ends collection. Unless the
// collected text equals keywordEndObj exactly,
// ErrInvalidIndirectEndObjectKeyword is raised.
func (s endOfIndirectObjectState) process(t *Tokenizer) {
	var collected []rune

scan:
	for {
		r, err := t.consumeNextRune()
		if err != nil {
			t.raiseError(err)
			return
		}
		switch r {
		case 'e', 'n', 'd', 'o', 'b', 'j':
			collected = append(collected, r)
		default:
			if uerr := t.unread(); uerr != nil {
				t.raiseError(uerr)
				return
			}
			break scan
		}
	}

	if string(collected) != keywordEndObj {
		t.raiseError(ErrInvalidIndirectEndObjectKeyword)
		return
	}

	t.emitToken(s.token)
}
package pdf
import (
"io"
"testing"
)
// TestEndIndirectObjectStat_Process drives endOfIndirectObjectState
// through empty input (expects io.EOF) and a valid "endobj " keyword
// (expects the end-of-indirect-object token on the stack).
func TestEndIndirectObjectStat_Process(t *testing.T) {
	tests := []struct {
		name          string
		input         string
		expectedState state
		expectedError error
		expectTokens  []tokenInterface
	}{
		{"empty", "", endOfIndirectObjectState{}, io.EOF, []tokenInterface{}},
		// Fixed test-case name (was the typo "emd of stream").
		{"end of object", "endobj ",
			endOfIndirectObjectState{}, nil, []tokenInterface{&endOfIndirectObjectToken{}}},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			tok := createTestTokenizerWithData(tt.input)
			tok.state = endOfIndirectObjectState{
				token: &endOfIndirectObjectToken{},
			}

			// Process the tokenizer.
			tok.state.process(tok)

			checkStateAgainstDef(t, tok, tt.expectedState)
			checkErrors(t, tok, tt.expectedError)
			checkTokenStack(t, tok, tt.expectTokens)
		})
	}
}
......@@ -41,7 +41,7 @@ func TestEndOfLineTokenIsNil(t *testing.T) {
// Verarbeite den Tokenizer
tok.state.process(tok)
if !tok.HasError() {
if !tok.hasError() {
t.Errorf("Expected error, got none")
}
}
package pdf
// endStreamObjectToken is emitted when the "endstream" keyword has been
// read in full.
type endStreamObjectToken struct {
	tokenInternal
}

// endStreamObjectState expects the literal keyword "endstream" next in
// the input.
type endStreamObjectState struct {
	token *endStreamObjectToken
}

// process consumes exactly len(keywordEndStream) runes and emits the
// token when they spell the keyword; otherwise ErrInvalidEndStream is
// raised. Deriving the count from the keyword (instead of a hard-coded
// 9) keeps the read length and the comparison in sync.
func (s endStreamObjectState) process(t *Tokenizer) {
	r, err := t.consumeNextRunes(len(keywordEndStream))
	if err != nil {
		t.raiseError(err)
		return
	}

	if string(r) != keywordEndStream {
		t.raiseError(ErrInvalidEndStream)
		return
	}

	t.emitToken(s.token)
}
package pdf
import (
"io"
"testing"
)
// TestEndStreamObjectStat_Process drives endStreamObjectState through
// empty input (expects io.EOF) and a valid "endstream" keyword (expects
// the end-stream token on the stack).
func TestEndStreamObjectStat_Process(t *testing.T) {
	tests := []struct {
		name          string
		input         string
		expectedState state
		expectedError error
		expectTokens  []tokenInterface
	}{
		{"empty", "", endStreamObjectState{}, io.EOF, []tokenInterface{}},
		// Fixed test-case name (was the typo "emd of stream").
		{"end of stream", "endstream",
			endStreamObjectState{}, nil, []tokenInterface{&endStreamObjectToken{}}},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			tok := createTestTokenizerWithData(tt.input)
			tok.state = endStreamObjectState{
				token: &endStreamObjectToken{},
			}

			// Process the tokenizer.
			tok.state.process(tok)

			checkStateAgainstDef(t, tok, tt.expectedState)
			checkErrors(t, tok, tt.expectedError)
			checkTokenStack(t, tok, tt.expectTokens)
		})
	}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment