Make ProcessWord support UTF-8

Modified ProcessWord to work correctly with UTF-8 strings and added test cases.

Signed-off-by: Daehyeok Mun <daehyeok@gmail.com>
This commit is contained in:
Daehyeok Mun 2015-10-18 21:55:53 -06:00
parent bb5551746b
commit bb79b7eb9e
3 changed files with 99 additions and 51 deletions

View File

@ -9,13 +9,15 @@ package dockerfile
import (
"fmt"
"strings"
"text/scanner"
"unicode"
)
type shellWord struct {
word string
envs []string
pos int
word string
scanner scanner.Scanner
envs []string
pos int
}
// ProcessWord will use the 'env' list of environment variables,
@ -26,11 +28,12 @@ func ProcessWord(word string, env []string) (string, error) {
envs: env,
pos: 0,
}
sw.scanner.Init(strings.NewReader(word))
return sw.process()
}
func (sw *shellWord) process() (string, error) {
return sw.processStopOn('\000')
return sw.processStopOn(scanner.EOF)
}
// Process the word, starting at 'pos', and stop when we get to the
@ -43,10 +46,11 @@ func (sw *shellWord) processStopOn(stopChar rune) (string, error) {
'$': sw.processDollar,
}
for sw.pos < len(sw.word) {
ch := sw.peek()
if stopChar != '\000' && ch == stopChar {
sw.next()
for sw.scanner.Peek() != scanner.EOF {
ch := sw.scanner.Peek()
if stopChar != scanner.EOF && ch == stopChar {
sw.scanner.Next()
break
}
if fn, ok := charFuncMapping[ch]; ok {
@ -58,14 +62,19 @@ func (sw *shellWord) processStopOn(stopChar rune) (string, error) {
result += tmp
} else {
// Not special, just add it to the result
ch = sw.next()
ch = sw.scanner.Next()
if ch == '\\' {
// '\' escapes, except end of line
ch = sw.next()
if ch == '\000' {
continue
ch = sw.scanner.Next()
if ch == scanner.EOF {
break
}
}
result += string(ch)
}
}
@ -73,36 +82,21 @@ func (sw *shellWord) processStopOn(stopChar rune) (string, error) {
return result, nil
}
func (sw *shellWord) peek() rune {
if sw.pos == len(sw.word) {
return '\000'
}
return rune(sw.word[sw.pos])
}
func (sw *shellWord) next() rune {
if sw.pos == len(sw.word) {
return '\000'
}
ch := rune(sw.word[sw.pos])
sw.pos++
return ch
}
func (sw *shellWord) processSingleQuote() (string, error) {
// All chars between single quotes are taken as-is
// Note, you can't escape '
var result string
sw.next()
sw.scanner.Next()
for {
ch := sw.next()
if ch == '\000' || ch == '\'' {
ch := sw.scanner.Next()
if ch == '\'' || ch == scanner.EOF {
break
}
result += string(ch)
}
return result, nil
}
@ -111,12 +105,12 @@ func (sw *shellWord) processDoubleQuote() (string, error) {
// But you can escape " with a \
var result string
sw.next()
sw.scanner.Next()
for sw.pos < len(sw.word) {
ch := sw.peek()
for sw.scanner.Peek() != scanner.EOF {
ch := sw.scanner.Peek()
if ch == '"' {
sw.next()
sw.scanner.Next()
break
}
if ch == '$' {
@ -126,18 +120,18 @@ func (sw *shellWord) processDoubleQuote() (string, error) {
}
result += tmp
} else {
ch = sw.next()
ch = sw.scanner.Next()
if ch == '\\' {
chNext := sw.peek()
chNext := sw.scanner.Peek()
if chNext == '\000' {
if chNext == scanner.EOF {
// Ignore \ at end of word
continue
}
if chNext == '"' || chNext == '$' {
// \" and \$ can be escaped, all other \'s are left as-is
ch = sw.next()
ch = sw.scanner.Next()
}
}
result += string(ch)
@ -148,23 +142,23 @@ func (sw *shellWord) processDoubleQuote() (string, error) {
}
func (sw *shellWord) processDollar() (string, error) {
sw.next()
ch := sw.peek()
sw.scanner.Next()
ch := sw.scanner.Peek()
if ch == '{' {
sw.next()
sw.scanner.Next()
name := sw.processName()
ch = sw.peek()
ch = sw.scanner.Peek()
if ch == '}' {
// Normal ${xx} case
sw.next()
sw.scanner.Next()
return sw.getEnv(name), nil
}
if ch == ':' {
// Special ${xx:...} format processing
// Yes it allows for recursive $'s in the ... spot
sw.next() // skip over :
modifier := sw.next()
sw.scanner.Next() // skip over :
modifier := sw.scanner.Next()
word, err := sw.processStopOn('}')
if err != nil {
@ -207,16 +201,16 @@ func (sw *shellWord) processName() string {
// If it starts with a numeric then just return $#
var name string
for sw.pos < len(sw.word) {
ch := sw.peek()
for sw.scanner.Peek() != scanner.EOF {
ch := sw.scanner.Peek()
if len(name) == 0 && unicode.IsDigit(ch) {
ch = sw.next()
ch = sw.scanner.Next()
return string(ch)
}
if !unicode.IsLetter(ch) && !unicode.IsDigit(ch) && ch != '_' {
break
}
ch = sw.next()
ch = sw.scanner.Next()
name += string(ch)
}

View File

@ -15,7 +15,7 @@ func TestShellParser(t *testing.T) {
defer file.Close()
scanner := bufio.NewScanner(file)
envs := []string{"PWD=/home", "SHELL=bash"}
envs := []string{"PWD=/home", "SHELL=bash", "KOREAN=한국어"}
for scanner.Scan() {
line := scanner.Text()

View File

@ -56,3 +56,57 @@ he${PWD:=000}xx | error
he${PWD:+${PWD}:}xx | he/home:xx
he${XXX:-\$PWD:}xx | he$PWD:xx
he${XXX:-\${PWD}z}xx | he${PWDz}xx
안녕하세요 | 안녕하세요
안'녕'하세요 | 안녕하세요
안'녕하세요 | 안녕하세요
안녕\'하세요 | 안녕'하세요
안\\'녕하세요 | 안\녕하세요
안녕\t하세요 | 안녕t하세요
"안녕\t하세요" | 안녕\t하세요
'안녕\t하세요 | 안녕\t하세요
안녕하세요\ | 안녕하세요
안녕하세요\\ | 안녕하세요\
"안녕하세요 | 안녕하세요
"안녕하세요\" | 안녕하세요"
"안녕'하세요" | 안녕'하세요
'안녕하세요 | 안녕하세요
'안녕하세요\' | 안녕하세요\
안녕$1x | 안녕x
안녕$.x | 안녕$.x
안녕$pwd. | 안녕.
안녕$PWD | 안녕/home
안녕\$PWD | 안녕$PWD
안녕\\$PWD | 안녕\/home
안녕\${} | 안녕${}
안녕\${}xx | 안녕${}xx
안녕${} | 안녕
안녕${}xx | 안녕xx
안녕${hi} | 안녕
안녕${hi}xx | 안녕xx
안녕${PWD} | 안녕/home
안녕${.} | error
안녕${XXX:-000}xx | 안녕000xx
안녕${PWD:-000}xx | 안녕/homexx
안녕${XXX:-$PWD}xx | 안녕/homexx
안녕${XXX:-${PWD:-yyy}}xx | 안녕/homexx
안녕${XXX:-${YYY:-yyy}}xx | 안녕yyyxx
안녕${XXX:YYY} | error
안녕${XXX:+${PWD}}xx | 안녕xx
안녕${PWD:+${XXX}}xx | 안녕xx
안녕${PWD:+${SHELL}}xx | 안녕bashxx
안녕${XXX:+000}xx | 안녕xx
안녕${PWD:+000}xx | 안녕000xx
'안녕${XX}' | 안녕${XX}
"안녕${PWD}" | 안녕/home
"안녕'$PWD'" | 안녕'/home'
'"안녕"' | "안녕"
안녕\$PWD | 안녕$PWD
"안녕\$PWD" | 안녕$PWD
'안녕\$PWD' | 안녕\$PWD
안녕${PWD | error
안녕${PWD:=000}xx | error
안녕${PWD:+${PWD}:}xx | 안녕/home:xx
안녕${XXX:-\$PWD:}xx | 안녕$PWD:xx
안녕${XXX:-\${PWD}z}xx | 안녕${PWDz}xx
$KOREAN | 한국어
안녕$KOREAN | 안녕한국어