提问人:Emulebest 提问时间:9/27/2023 最后编辑:Emulebest 更新时间:9/28/2023 访问量:67
我无法确定为什么在 Zig 中使用递归标记联合时内存会损坏
I can't pinpoint why memory gets corrupted while working with Recursive Tagged Union in Zig
问:
我正在开发一个功能不完整的小型 Lisp 解释器来学习一些 Zig。我的灵感来自这个 https://norvig.com/lispy.html,这可能不是在 C/Zig 中实现它的最惯用的方式,但仍然非常小而简单,这就是我正在寻找的。我现在站在通过标记工会代表 AST 的舞台上。但是,我无法理解为什么下面的代码似乎最终使我处于损坏/覆盖的内存状态。
const std = @import("std");
// const test_string = "(begin (define r 10) (* pi (* r r)))";
const shorter_test_string = "(begin (define r 10))";
const Allocator = std.mem.Allocator;
const Tag = enum { String, Number, Bool, Symbol };
const Type = union(Tag) { String: std.ArrayList(u8), Number: i64, Bool: bool, Symbol: []const u8 };
const TokenType = enum { Expr, Single };
const Token = union(TokenType) {
Expr: *std.ArrayList(*Token),
Single: *Type,
};
pub fn finalizeWord(string_collector: *std.ArrayList(u8), outer_list: *std.ArrayList(std.ArrayList(u8))) !void {
if (!std.mem.eql(u8, string_collector.items, "")) {
try outer_list.append(try string_collector.clone());
string_collector.clearRetainingCapacity();
}
}
// TODO: I commented out all the deallocations for debugging purposes, will add them later
pub fn tokenize(alloca: *Allocator, parsed_input: *std.ArrayList(std.ArrayList(u8))) !*Token {
var token: std.ArrayList(u8) = parsed_input.orderedRemove(0);
if (std.mem.eql(u8, token.items, "(")) {
std.debug.print("Open bracket encountered \n", .{});
// defer token.deinit();
var list = std.ArrayList(*Token).init(alloca.*);
var local_token = try moveToHeap(alloca, Token{ .Expr = &list });
while (!std.mem.eql(u8, parsed_input.items[0].items, ")")) {
std.debug.print("While loop iteration \n", .{});
var item_to_add = try tokenize(alloca, parsed_input);
try list.append(item_to_add);
}
_ = parsed_input.orderedRemove(0);
// defer closing_bracket.deinit();
return local_token;
} else if (std.mem.eql(u8, token.items, ")")) {
// TODO: This is definitely not unreachable but a Syntax Error
unreachable;
} else {
var item = try moveToHeap(alloca, Token{ .Single = try atomize(alloca, token) });
std.debug.print("Single item encountered: {s} \n", .{item.*.Single.*.String.items});
return item;
}
}
pub fn atomize(alloca: *Allocator, item: std.ArrayList(u8)) !*Type {
return try moveToHeap(alloca, Type{ .String = try item.clone() });
}
pub fn parse(alloca: *Allocator, input_string: []const u8) !std.ArrayList(std.ArrayList(u8)) {
var outer_list = std.ArrayList(std.ArrayList(u8)).init(alloca.*);
var string_collector = std.ArrayList(u8).init(alloca.*);
defer string_collector.deinit();
for (input_string) |character| {
if (character == '(') {
try finalizeWord(&string_collector, &outer_list);
var opening_bracket = std.ArrayList(u8).init(alloca.*);
try opening_bracket.append('(');
try outer_list.append(opening_bracket);
continue;
} else if (character == ')') {
try finalizeWord(&string_collector, &outer_list);
var closing_bracket = std.ArrayList(u8).init(alloca.*);
try closing_bracket.append(')');
try outer_list.append(closing_bracket);
continue;
} else if (character == ' ') {
try finalizeWord(&string_collector, &outer_list);
} else {
try string_collector.append(character);
}
}
return outer_list;
}
pub fn moveToHeap(allocator: *Allocator, value: anytype) !*@TypeOf(value) {
const T = @TypeOf(value);
std.debug.assert(@typeInfo(T) != .Pointer);
const ptr = try allocator.create(T);
ptr.* = value;
return ptr;
}
pub fn print_tokens(token: *Token) void {
switch (token.*) {
.Expr => |expr| {
// TODO: deallocations should happen in separate functions
// defer expr.deinit();
std.debug.print("Processing Expression \n", .{});
for (expr.items) |t| {
print_tokens(t);
}
},
.Single => |item| {
// TODO: deallocations should happen in separate functions
// defer item.*.String.deinit();
std.debug.print("Processing Token: {s}\n", .{item.*.String.items});
},
}
}
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer {
_ = gpa.deinit();
}
var allocator = gpa.allocator();
var parsed_string = try parse(&allocator, shorter_test_string);
defer parsed_string.deinit();
defer {
for (parsed_string.items) |item| {
item.deinit();
}
}
var tokens = try tokenize(&allocator, &parsed_string);
print_tokens(tokens); // SEGFAULT happens here
}
我正在努力解决的特定部分是函数和 .如果你在打印令牌时运行代码,你会得到SEGFAULTS,这似乎是因为一旦你退出函数,一些嵌套的内存就会被损坏/覆盖。我有一种直觉,这种表现形式tokenize
print_tokens
main
tokenize
Expr
const Tag = enum { String, Number, Bool, Symbol };
const Type = union(Tag) { String: std.ArrayList(u8), Number: i64, Bool: bool, Symbol: []const u8 };
const TokenType = enum { Expr, Single };
const Token = union(TokenType) {
Expr: *std.ArrayList(*Token),
Single: *Type,
};
可能有一些固有的设计问题,但在这一点上,我想知道内存覆盖的真正原因是什么,因为一切都是手动放置在堆上的。
我尝试使用LLDB进行调试,似乎甚至在这里放置或进行堆分配print
_ = parsed_input.orderedRemove(0);
std.debug.print("Sample print", .{});
// defer closing_bracket.deinit();
return local_token;
覆盖 中的内部内容。正如我之前提到的,我现在更感兴趣的是为什么会出现内存问题,然后在最终的 Lisp 解释器实现:)Expr
local_token.*.Expr.*.items[1]
编辑:
似乎 Zig 类型的系统有点欺骗了我,让我以为它比我想象的更神奇。 是一个在堆上分配了项目的结构,问题是我在堆分配的联合中存储了指向本地堆栈分配结构的指针std.ArrayList
list
local_token
除了评论中建议的解决方案之外,我发现另一种解决方案是存储在 中,使其成为表示 ArrayList 的结构体的事实上的所有者。std.ArrayList(*Token)
Expr
const Token = union(TokenType) {
Expr: std.ArrayList(*Token),
Single: Type,
};
和功能:tokenize
// TODO: I commented out all the deallocations for debugging purposes, will add them later
pub fn tokenize(alloca: *Allocator, parsed_input: *std.ArrayList(std.ArrayList(u8))) !*Token {
var token: std.ArrayList(u8) = parsed_input.orderedRemove(0);
if (std.mem.eql(u8, token.items, "(")) {
std.debug.print("Open bracket encountered \n", .{});
// defer token.deinit();
var local_token = try moveToHeap(alloca, Token{ .Expr = std.ArrayList(*Token).init(alloca.*) });
while (!std.mem.eql(u8, parsed_input.items[0].items, ")")) {
std.debug.print("While loop iteration \n", .{});
var item_to_add = try tokenize(alloca, parsed_input);
try local_token.*.Expr.append(item_to_add);
}
_ = parsed_input.orderedRemove(0);
// defer closing_bracket.deinit();
return local_token;
} else if (std.mem.eql(u8, token.items, ")")) {
// TODO: This is definitely not unreachable but a Syntax Error
unreachable;
} else {
var item = try moveToHeap(alloca, Token{ .Single = try atomize(token) });
std.debug.print("Single item encountered: {s} \n", .{item.*.Single.String.items});
return item;
}
}
编辑2:
显然,我完全不需要额外的堆分配。我担心我最终会得到大小不正确的递归数据结构,但使用存储在其中的 ArrayList 已经给了它正确的布局,因此这段代码会更理想:Token
const Token = union(TokenType) {
Expr: std.ArrayList(Token),
Single: Type,
};
答:
问题在于如何在以下位置创建表达式列表:tokenize
var list = std.ArrayList(*Token).init(alloca.*);
var local_token = try moveToHeap(alloca, Token{ .Expr = &list });
您存储了指向 的指针,但它位于函数的堆栈中。退出后,指针将失效。只需使用分配器创建列表即可。例如:list
tokenize
var list = try alloca.create(std.ArrayList(*Token));
list.* = std.ArrayList(*Token).init(alloca.*);
var local_token = try moveToHeap(alloca, Token{ .Expr = list });
评论
std.ArrayList(*Token).init(alloca.*);
init
返回写入函数堆栈的值。从概念上讲,这与做 .var a: i32 = 0
评论