上一篇《ChatGPT|AI自制编程语言-从0开始》…
本文是《AI自制编程语言》系列的词法解析部分。词法解析的功能是将源代码中的字符序列转换为单词(Token)序列,实现主要分为如下步骤:
- 定义词法单元:根据编程语言的语法规则,定义各种词法单元的正则表达式,例如标识符、关键字、运算符、常量等;
- 读入源代码:从源文件或输入流中读入源代码,逐个字符进行处理;
- 扫描词法单元:通过逐个字符扫描,将源代码中的字符序列转换为词法单元;
- 生成Token序列:将识别出的词法单元转换为Token序列,每个Token包含词法单元的类型和值;
- 返回Token序列:将生成的Token序列返回给编译器的语法分析器(Parser)进行进一步处理。
1、定义词法单元
先来一个《AI自制编程语言》代码的例子:
let five=5;
let ten =10;
let add = fn(x, y){
x+y;
};
let result = add(five, ten);
function fib(a) {
if (a <= 1) {
return a;
}
return fib(a - 1) + fib(a - 2);
}
(1)定义token
基于上述例子的考虑,定义如下token:
&&、=、*、*=、`、!、case、:、,、CONST、~=、DEFAULT、DEFINE_FUNCTION、..、ELSE、EOF、==、FALSE、FLOAT、FOR、FOREACH、FUNCTION、>、>=、IDENT、IF、ILLEGAL、IN、INT、{、[、LET、(、<、<=、-、-=、--、%、!~、!=、null、||、.、+、+=、++、**、?、}、]、REGEXP、RETURN、)、;、/、/=、STRING、switch、TRUE
(2)定义关键词
同时考虑一些通用关键词,定义关键词如下:
case
const
default
else
false
fn
for
foreach
function
if
in
let
null
return
switch
true
2、定义Prompt
确定了词法单元后,开始写代码,其中定义Prompt如下。
你是一个使用golang开发的资深的程序员,正在开发一个新的语言的词法lexer,需要如下要求:
### 支持Token列表如下
AND = "&&"
ASSIGN = "="
ASTERISK = "*"
ASTERISK_EQUALS = "*="
BACKTICK = "`"
BANG = "!"
CASE = "case"
COLON = ":"
COMMA = ","
CONST = "CONST"
CONTAINS = "~="
DEFAULT = "DEFAULT"
DEFINE_FUNCTION = "DEFINE_FUNCTION"
DOTDOT = ".."
ELSE = "ELSE"
EOF = "EOF"
EQ = "=="
FALSE = "FALSE"
FLOAT = "FLOAT"
FOR = "FOR"
FOREACH = "FOREACH"
FUNCTION = "FUNCTION"
GT = ">"
GT_EQUALS = ">="
IDENT = "IDENT"
IF = "IF"
ILLEGAL = "ILLEGAL"
IN = "IN"
INT = "INT"
LBRACE = "{"
LBRACKET = "["
LET = "LET"
LPAREN = "("
LT = "<"
LT_EQUALS = "<="
MINUS = "-"
MINUS_EQUALS = "-="
MINUS_MINUS = "--"
MOD = "%"
NOT_CONTAINS = "!~"
NOT_EQ = "!="
NULL = "null"
OR = "||"
PERIOD = "."
PLUS = "+"
PLUS_EQUALS = "+="
PLUS_PLUS = "++"
POW = "**"
QUESTION = "?"
RBRACE = "}"
RBRACKET = "]"
REGEXP = "REGEXP"
RETURN = "RETURN"
RPAREN = ")"
SEMICOLON = ";"
SLASH = "/"
SLASH_EQUALS = "/="
STRING = "STRING"
SWITCH = "switch"
TRUE = "TRUE"
### 支持关键词如下
"case": CASE
"const": CONST
"default": DEFAULT
"else": ELSE
"false": FALSE
"fn": FUNCTION
"for": FOR
"foreach": FOREACH
"function": DEFINE_FUNCTION
"if": IF
"in": IN
"let": LET
"null": NULL
"return": RETURN
"switch": SWITCH
"true": TRUE
### lexer的入口函数要求是输入是字符串,输出是token的结构体
### 给出测试用例和main函数
3、生成代码
基于上述的Prompt,生成代码如下:
package main
import (
"fmt"
"strings"
)
// TokenType names the category of a token. It is a plain string, so a token
// type prints as its own name.
type TokenType string
// Token types known to the lexer. Every constant's value is its own name.
// Where a trailing comment is given it shows the operator lexeme or the
// keyword spelling the type stands for.
const (
AND TokenType = "AND" // &&
ASSIGN TokenType = "ASSIGN" // =
ASTERISK TokenType = "ASTERISK" // *
ASTERISK_EQUALS TokenType = "ASTERISK_EQUALS" // *=
BACKTICK TokenType = "BACKTICK" // `
BANG TokenType = "BANG" // !
CASE TokenType = "CASE" // keyword "case"
COLON TokenType = "COLON" // :
COMMA TokenType = "COMMA" // ,
CONST TokenType = "CONST" // keyword "const"
CONTAINS TokenType = "CONTAINS" // ~=
DEFAULT TokenType = "DEFAULT" // keyword "default"
DEFINE_FUNCTION TokenType = "DEFINE_FUNCTION" // keyword "function"
DOTDOT TokenType = "DOTDOT" // ..
ELSE TokenType = "ELSE" // keyword "else"
EOF TokenType = "EOF" // end of input; appended last by Lexer
EQ TokenType = "EQ" // ==
FALSE TokenType = "FALSE" // keyword "false"
FLOAT TokenType = "FLOAT"
FOR TokenType = "FOR" // keyword "for"
FOREACH TokenType = "FOREACH" // keyword "foreach"
FUNCTION TokenType = "FUNCTION" // keyword "fn"
GT TokenType = "GT" // >
GT_EQUALS TokenType = "GT_EQUALS" // >=
IDENT TokenType = "IDENT" // identifier that is not a keyword
IF TokenType = "IF" // keyword "if"
ILLEGAL TokenType = "ILLEGAL"
IN TokenType = "IN" // keyword "in"
INT TokenType = "INT"
LBRACE TokenType = "LBRACE" // {
LBRACKET TokenType = "LBRACKET" // [
LET TokenType = "LET" // keyword "let"
LPAREN TokenType = "LPAREN" // (
LT TokenType = "LT" // <
LT_EQUALS TokenType = "LT_EQUALS" // <=
MINUS TokenType = "MINUS" // -
MINUS_EQUALS TokenType = "MINUS_EQUALS" // -=
MINUS_MINUS TokenType = "MINUS_MINUS" // --
MOD TokenType = "MOD" // %
NOT_CONTAINS TokenType = "NOT_CONTAINS" // !~
NOT_EQ TokenType = "NOT_EQ" // !=
NULL TokenType = "NULL" // keyword "null"
OR TokenType = "OR" // ||
PERIOD TokenType = "PERIOD" // .
PLUS TokenType = "PLUS" // +
PLUS_EQUALS TokenType = "PLUS_EQUALS" // +=
PLUS_PLUS TokenType = "PLUS_PLUS" // ++
POW TokenType = "POW" // **
QUESTION TokenType = "QUESTION" // ?
RBRACE TokenType = "RBRACE" // }
RBRACKET TokenType = "RBRACKET" // ]
REGEXP TokenType = "REGEXP"
RETURN TokenType = "RETURN" // keyword "return"
RPAREN TokenType = "RPAREN" // )
SEMICOLON TokenType = "SEMICOLON" // ;
SLASH TokenType = "SLASH" // /
SLASH_EQUALS TokenType = "SLASH_EQUALS" // /=
STRING TokenType = "STRING" // double-quoted string literal
SWITCH TokenType = "SWITCH" // keyword "switch"
TRUE TokenType = "TRUE" // keyword "true"
)
// Token is one lexical unit produced by Lexer: a category plus its text.
type Token struct {
// Type is the token's category.
Type TokenType
// Value is the text associated with the token. For STRING tokens it is the
// contents between the quotes (quotes excluded); for EOF it is empty.
Value string
}
// Lexer splits input into tokens and returns them in source order. The
// returned slice always ends with exactly one EOF token whose Value is empty.
//
// Scanning rules:
//   - Space, tab, CR and LF separate tokens and are discarded.
//   - Two-character operators are matched before their one-character prefix
//     (for example "<=" before "<", ".." before ".").
//   - "~=" yields CONTAINS and "!~" yields NOT_CONTAINS.
//   - A double-quoted string yields STRING with the text between the quotes;
//     no escape sequences are interpreted. A string without a closing quote
//     yields one ILLEGAL token holding the rest of the input.
//   - A word made of ASCII letters, digits and '_' (not starting with a
//     digit) yields its keyword type, or IDENT when it is not a keyword.
//   - A run of digits yields INT; digits '.' digits yields FLOAT. A dot that
//     is not followed by a digit is left for the PERIOD/DOTDOT rules, so
//     "1..5" lexes as INT DOTDOT INT.
//   - Anything else yields ILLEGAL instead of being dropped silently: a lone
//     '&', '|' or '~', any other unknown ASCII byte, and each maximal run of
//     non-ASCII bytes (so a multi-byte UTF-8 word becomes a single token).
func Lexer(input string) []Token {
	var tokens []Token
	emit := func(typ TokenType, value string) {
		tokens = append(tokens, Token{Type: typ, Value: value})
	}
	// nextIs reports whether the byte right after position pos equals ch.
	nextIs := func(pos int, ch byte) bool {
		return pos+1 < len(input) && input[pos+1] == ch
	}

	for i := 0; i < len(input); i++ {
		switch ch := input[i]; ch {
		case ' ', '\t', '\r', '\n':
			// Whitespace only separates tokens.
		case '&':
			if nextIs(i, '&') {
				emit(AND, "&&")
				i++
			} else {
				emit(ILLEGAL, "&")
			}
		case '|':
			if nextIs(i, '|') {
				emit(OR, "||")
				i++
			} else {
				emit(ILLEGAL, "|")
			}
		case '~':
			if nextIs(i, '=') {
				emit(CONTAINS, "~=")
				i++
			} else {
				emit(ILLEGAL, "~")
			}
		case '=':
			if nextIs(i, '=') {
				emit(EQ, "==")
				i++
			} else {
				emit(ASSIGN, "=")
			}
		case '!':
			switch {
			case nextIs(i, '='):
				emit(NOT_EQ, "!=")
				i++
			case nextIs(i, '~'):
				emit(NOT_CONTAINS, "!~")
				i++
			default:
				emit(BANG, "!")
			}
		case '*':
			switch {
			case nextIs(i, '='):
				emit(ASTERISK_EQUALS, "*=")
				i++
			case nextIs(i, '*'):
				emit(POW, "**")
				i++
			default:
				emit(ASTERISK, "*")
			}
		case '+':
			switch {
			case nextIs(i, '='):
				emit(PLUS_EQUALS, "+=")
				i++
			case nextIs(i, '+'):
				emit(PLUS_PLUS, "++")
				i++
			default:
				emit(PLUS, "+")
			}
		case '-':
			switch {
			case nextIs(i, '='):
				emit(MINUS_EQUALS, "-=")
				i++
			case nextIs(i, '-'):
				emit(MINUS_MINUS, "--")
				i++
			default:
				emit(MINUS, "-")
			}
		case '/':
			if nextIs(i, '=') {
				emit(SLASH_EQUALS, "/=")
				i++
			} else {
				emit(SLASH, "/")
			}
		case '>':
			if nextIs(i, '=') {
				emit(GT_EQUALS, ">=")
				i++
			} else {
				emit(GT, ">")
			}
		case '<':
			if nextIs(i, '=') {
				emit(LT_EQUALS, "<=")
				i++
			} else {
				emit(LT, "<")
			}
		case '.':
			if nextIs(i, '.') {
				emit(DOTDOT, "..")
				i++
			} else {
				emit(PERIOD, ".")
			}
		case '`':
			emit(BACKTICK, "`")
		case ':':
			emit(COLON, ":")
		case ',':
			emit(COMMA, ",")
		case ';':
			emit(SEMICOLON, ";")
		case '?':
			emit(QUESTION, "?")
		case '%':
			emit(MOD, "%")
		case '(':
			emit(LPAREN, "(")
		case ')':
			emit(RPAREN, ")")
		case '{':
			emit(LBRACE, "{")
		case '}':
			emit(RBRACE, "}")
		case '[':
			emit(LBRACKET, "[")
		case ']':
			emit(RBRACKET, "]")
		case '"':
			end := strings.IndexByte(input[i+1:], '"')
			if end < 0 {
				// Unterminated string: report everything that is left
				// rather than lexing the string's contents as code.
				emit(ILLEGAL, input[i:])
				i = len(input)
				break
			}
			emit(STRING, input[i+1:i+1+end])
			i += end + 1 // now on the closing quote; the loop steps past it
		default:
			switch {
			case isLetter(ch):
				j := i + 1
				for j < len(input) && (isLetter(input[j]) || isDigit(input[j])) {
					j++
				}
				word := input[i:j]
				// getKeywordTokenType falls back to IDENT for non-keywords.
				emit(getKeywordTokenType(word), word)
				i = j - 1
			case isDigit(ch):
				j := i + 1
				for j < len(input) && isDigit(input[j]) {
					j++
				}
				typ := INT
				// Take a fractional part only when the dot is followed by a
				// digit, so range ("1..5") and member ("1.x") dots survive.
				if j+1 < len(input) && input[j] == '.' && isDigit(input[j+1]) {
					typ = FLOAT
					j += 2
					for j < len(input) && isDigit(input[j]) {
						j++
					}
				}
				emit(typ, input[i:j])
				i = j - 1
			case ch >= 0x80:
				// Group consecutive non-ASCII bytes so a multi-byte UTF-8
				// sequence is reported once, not byte by byte.
				j := i + 1
				for j < len(input) && input[j] >= 0x80 {
					j++
				}
				emit(ILLEGAL, input[i:j])
				i = j - 1
			default:
				emit(ILLEGAL, input[i:i+1])
			}
		}
	}
	emit(EOF, "")
	return tokens
}
// isLetter reports whether ch may appear in an identifier as a non-digit
// character: an ASCII letter of either case, or an underscore.
func isLetter(ch byte) bool {
	switch {
	case ch == '_':
		return true
	case 'a' <= ch && ch <= 'z':
		return true
	case 'A' <= ch && ch <= 'Z':
		return true
	}
	return false
}
// isDigit reports whether ch is an ASCII decimal digit ('0' through '9').
func isDigit(ch byte) bool {
	// Byte subtraction wraps around, so anything below '0' becomes a large
	// value and a single comparison covers both ends of the range.
	offset := ch - '0'
	return offset < 10
}
// isKeyword reports whether token is one of the language's reserved words.
// The match is exact and case-sensitive: "let" is a keyword, "Let" is not.
//
// A constant switch is used so that no map is allocated and filled on every
// call.
func isKeyword(token string) bool {
	switch token {
	case "case", "const", "default", "else", "false", "fn", "for", "foreach",
		"function", "if", "in", "let", "null", "return", "switch", "true":
		return true
	}
	return false
}
// keywordTokenTypes maps every reserved word to the token type it lexes as.
// Note that "fn" maps to FUNCTION while "function" maps to DEFINE_FUNCTION.
var keywordTokenTypes = map[string]TokenType{
	"case":     CASE,
	"const":    CONST,
	"default":  DEFAULT,
	"else":     ELSE,
	"false":    FALSE,
	"fn":       FUNCTION,
	"for":      FOR,
	"foreach":  FOREACH,
	"function": DEFINE_FUNCTION,
	"if":       IF,
	"in":       IN,
	"let":      LET,
	"null":     NULL,
	"return":   RETURN,
	"switch":   SWITCH,
	"true":     TRUE,
}

// getKeywordTokenType returns the token type for the reserved word token.
// Any string that is not a reserved word yields IDENT.
func getKeywordTokenType(token string) TokenType {
	if typ, found := keywordTokenTypes[token]; found {
		return typ
	}
	return IDENT
}
(注意:由于输出的代码比较长,如果遇到输出中断,可以输入“继续”;如果受到GPT的token长度限制,可以切换为16K或者32K的版本)
为了方便测试,我又加了一些测试用例:
// Additional test cases.
//
// main runs Lexer over each sample program and prints the program text
// followed by one "Type: ..., Value: ..." line per token.
func main() {
	inputs := []string{
		`
let x = 10;
if (x > 5) {
fmt.Println("x is greater than 5")
} else {
fmt.Println("x is less than or equal to 5")
}
`,
		`
let five=5;
let ten =10;
let add = fn(x, y){
x+y;
};
let result = add(five, ten);
function fib(a) {
if (a <= 1) {
return a;
}
return fib(a - 1) + fib(a - 2);
}
!- *5;
5<10>5;
if(5<10){
return true;
}else{
return false;
}
10 == 10;
10 != 9;
"foobar"
"foo bar"
[1,2];
{"foo":"bar"}
1.2
0.5
0.3
世界
for
2 >= 1
1 <= 3
`,
	}
	for _, input := range inputs {
		fmt.Printf("input: %v", input)
		tokens := Lexer(input)
		for _, token := range tokens {
			// The newline escape is required: without the backslash every
			// token was printed on one line followed by a literal "n".
			fmt.Printf("Type: %s, Value: %s\n", token.Type, token.Value)
		}
	}
}
4、运行代码
将生成的代码在https://go.dev/play/
上运行,获得输出结果(在未整合代码之前,输出的结构体是正常的)。
input: let x = 10;
if (x > 5) {
fmt.Println("x is greater than 5")
} else {
fmt.Println("x is less than or equal to 5")
}Type: LET, Value: let
Type: IDENT, Value: x
Type: ASSIGN, Value: =
Type: INT, Value: 10
Type: SEMICOLON, Value: ;
Type: IF, Value: if
Type: LPAREN, Value: (
Type: IDENT, Value: x
Type: GT, Value: >
Type: INT, Value: 5
Type: RPAREN, Value: )
Type: LBRACE, Value: {
Type: IDENT, Value: fmt
Type: PERIOD, Value: .
Type: IDENT, Value: Println
Type: LPAREN, Value: (
Type: STRING, Value: x is greater than 5
Type: RPAREN, Value: )
Type: RBRACE, Value: }
Type: ELSE, Value: else
Type: LBRACE, Value: {
Type: IDENT, Value: fmt
Type: PERIOD, Value: .
Type: IDENT, Value: Println
Type: LPAREN, Value: (
Type: STRING, Value: x is less than or equal to 5
Type: RPAREN, Value: )
Type: RBRACE, Value: }
Type: EOF, Value:
input:
let five=5;
let ten =10;
let add = fn(x, y){
x+y;
};
function fib(a) {
if (a <= 1) {
return a;
}
return fib(a - 1) + fib(a - 2);
}
let result = add(five, ten);
!- *5;
5<10>5;
if(5<10){
return true;
}else{
return false;
}
10 == 10;
10 != 9;
"foobar"
"foo bar"
[1,2];
{"foo":"bar"}
1.2
0.5
0.3
世界
for
2 >= 1
1 <= 3
Type: LET, Value: let
Type: IDENT, Value: five
Type: ASSIGN, Value: =
Type: INT, Value: 5
Type: SEMICOLON, Value: ;
Type: LET, Value: let
Type: IDENT, Value: ten
Type: ASSIGN, Value: =
Type: INT, Value: 10
Type: SEMICOLON, Value: ;
Type: LET, Value: let
Type: IDENT, Value: add
Type: ASSIGN, Value: =
Type: FUNCTION, Value: fn
Type: LPAREN, Value: (
Type: IDENT, Value: x
Type: COMMA, Value: ,
Type: IDENT, Value: y
Type: RPAREN, Value: )
Type: LBRACE, Value: {
Type: IDENT, Value: x
Type: PLUS, Value: +
Type: IDENT, Value: y
Type: SEMICOLON, Value: ;
Type: RBRACE, Value: }
Type: SEMICOLON, Value: ;
Type: DEFINE_FUNCTION, Value: function
Type: IDENT, Value: fib
Type: LPAREN, Value: (
Type: IDENT, Value: a
Type: RPAREN, Value: )
Type: LBRACE, Value: {
Type: IF, Value: if
Type: LPAREN, Value: (
Type: IDENT, Value: a
Type: LT_EQUALS, Value: <=
Type: INT, Value: 1
Type: RPAREN, Value: )
Type: LBRACE, Value: {
Type: RETURN, Value: return
Type: IDENT, Value: a
Type: SEMICOLON, Value: ;
Type: RBRACE, Value: }
Type: RETURN, Value: return
Type: IDENT, Value: fib
Type: LPAREN, Value: (
Type: IDENT, Value: a
Type: MINUS, Value: -
Type: INT, Value: 1
Type: RPAREN, Value: )
Type: PLUS, Value: +
Type: IDENT, Value: fib
Type: LPAREN, Value: (
Type: IDENT, Value: a
Type: MINUS, Value: -
Type: INT, Value: 2
Type: RPAREN, Value: )
Type: SEMICOLON, Value: ;
Type: RBRACE, Value: }
Type: LET, Value: let
Type: IDENT, Value: result
Type: ASSIGN, Value: =
Type: IDENT, Value: add
Type: LPAREN, Value: (
Type: IDENT, Value: five
Type: COMMA, Value: ,
Type: IDENT, Value: ten
Type: RPAREN, Value: )
Type: SEMICOLON, Value: ;
Type: BANG, Value: !
Type: MINUS, Value: -
Type: ASTERISK, Value: *
Type: INT, Value: 5
Type: SEMICOLON, Value: ;
Type: INT, Value: 5
Type: LT, Value: <
Type: INT, Value: 10
Type: GT, Value: >
Type: INT, Value: 5
Type: SEMICOLON, Value: ;
Type: IF, Value: if
Type: LPAREN, Value: (
Type: INT, Value: 5
Type: LT, Value: <
Type: INT, Value: 10
Type: RPAREN, Value: )
Type: LBRACE, Value: {
Type: RETURN, Value: return
Type: TRUE, Value: true
Type: SEMICOLON, Value: ;
Type: RBRACE, Value: }
Type: ELSE, Value: else
Type: LBRACE, Value: {
Type: RETURN, Value: return
Type: FALSE, Value: false
Type: SEMICOLON, Value: ;
Type: RBRACE, Value: }
Type: INT, Value: 10
Type: EQ, Value: ==
Type: INT, Value: 10
Type: SEMICOLON, Value: ;
Type: INT, Value: 10
Type: NOT_EQ, Value: !=
Type: INT, Value: 9
Type: SEMICOLON, Value: ;
Type: STRING, Value: foobar
Type: STRING, Value: foo bar
Type: LBRACKET, Value: [
Type: INT, Value: 1
Type: COMMA, Value: ,
Type: INT, Value: 2
Type: RBRACKET, Value: ]
Type: SEMICOLON, Value: ;
Type: LBRACE, Value: {
Type: STRING, Value: foo
Type: COLON, Value: :
Type: STRING, Value: bar
Type: RBRACE, Value: }
Type: INT, Value: 1.2
Type: INT, Value: 0.5
Type: INT, Value: 0.3
Type: FOR, Value: for
Type: INT, Value: 2
Type: GT_EQUALS, Value: >=
Type: INT, Value: 1
Type: INT, Value: 1
Type: LT_EQUALS, Value: <=
Type: INT, Value: 3
Type: EOF, Value:
参考
(1)《用Go语言自制解析器》
(2)https://go.dev/play/
原文始发于微信公众号(周末程序猿):ChatGPT|AI自制编程语言-词法解析
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
文章由极客之音整理,本文链接:https://www.bmabk.com/index.php/post/169247.html