From d4ad39fad4399e5c9e3242060854c89c2543e890 Mon Sep 17 00:00:00 2001 From: zhangzhuang08 Date: Thu, 31 Oct 2024 16:31:52 +0800 Subject: [PATCH 1/2] blog: 1. improve simple coroutine implementation; 2. introduce difference between Sync & Send trait; --- docs/blog/coroutine.md | 293 +++++++++++++++++++++++++++++++++++++++++ docs/rust/question.md | 14 ++ 2 files changed, 307 insertions(+) diff --git a/docs/blog/coroutine.md b/docs/blog/coroutine.md index a986816..775bb92 100644 --- a/docs/blog/coroutine.md +++ b/docs/blog/coroutine.md @@ -403,4 +403,297 @@ void wrapper(Entry entry) { 然后协程的entry字段指向 wrapper。容我以后有时间再捣鼓捣鼓吧。 +## 优化1 +```C +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> + +#define STACK_SIZE 1024 + +typedef struct { + u_int64_t x0; + /** 栈顶寄存器 */ + u_int64_t sp; + /** 栈底寄存器 */ + u_int64_t x29; + /** 返回地址寄存器 */ + u_int64_t x30; + /** 协程结束,应该跳转到哪个地址 */ + u_int64_t endAddr; + u_int64_t endSp; + u_int64_t endX29; + u_int64_t endX30; + + uint64_t entry; + uint64_t stack; + size_t size; + volatile int dead; + volatile int start; +} Routine; + +Routine* queueZero(); +Routine* queueOne(); + +typedef void (*Entry)(); + +Routine* current; +Routine queue[2]; + +void switchToRoutine(); + +void markCurrentDead() { + current->dead = 1; +}; + +void wrapper(Entry entry) { + entry(); + markCurrentDead(); + printf("ehy\n"); + switchToRoutine(); +} + +void createRoutine(Entry entry) { + u_int8_t* stack = (u_int8_t*)malloc(STACK_SIZE); + if (stack == NULL) { + perror("Failed to allocate stack memory"); + exit(EXIT_FAILURE); + } + Routine r; + uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~0xF; // 16字节对齐 + r.sp = aligned_sp; + r.x29 = aligned_sp; + r.entry = (uint64_t)wrapper; + r.x30 = (uint64_t)wrapper; + r.size = STACK_SIZE - 1; + r.dead = 0; + r.stack = (uint64_t)stack; + r.x0 = (uint64_t)entry; + r.start = 0; + + queue[1] = r; +} + +void execute(); +void release(); +void mainRoutineEntry(); + +void createMainRoutine(Entry entry) { + u_int8_t* stack = 
(u_int8_t*)malloc(STACK_SIZE); + if (stack == NULL) { + perror("Failed to allocate stack memory"); + exit(EXIT_FAILURE); + } + Routine r; + uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~0xF; // 16字节对齐 + r.sp = aligned_sp; + r.x29 = aligned_sp; + r.entry = (uint64_t)entry; + r.x30 = (uint64_t)entry; + r.size = STACK_SIZE - 1; + r.dead = 0; + r.stack = (uint64_t)stack; + r.x0 = (uint64_t)entry; + + queue[0] = r; +} + +void create() { + createMainRoutine(mainRoutineEntry); + execute(); + release(); +} + +__attribute__((naked)) void switchToInitChildRoutine() { + __asm__ volatile( + "mov %0, sp\n\t" + "mov %1, x29\n\t" + "mov %2, x30\n\t" + : "=r"(queue[0].sp), "=r"(queue[0].x29), "=r"(queue[0].x30) + : + :"memory" + ); + + __asm__ volatile( + "ldr x10, [%0]\n\t" + "mov sp, x10\n\t" + "ldr x29, [%1]\n\t" + "ldr x30, [%2]\n\t" + "ldr x0, [%3]\n\t" + "ret\n\t" + : + :"r"(&current->sp), "r"(&current->x29), "r"(&current->x30), "r"(&current->x0) + :"x10", "x30", "memory", "x0" + ); + +} + +__attribute__((naked)) void switchToChildRoutine() { + __asm__ volatile( + "mov %0, sp\n\t" + "mov %1, x29\n\t" + "mov %2, x30\n\t" + : "=r"(queue[0].sp), "=r"(queue[0].x29), "=r"(queue[0].x30) + : + :"memory" + ); + __asm__ volatile( + "mov x10, %0\n\t" + "mov sp, x10\n\t" + "mov x29, %1\n\t" + "mov x30, %2\n\t" + "ret\n\t" + : + :"r"(current->sp), "r"(current->x29), "r"(current->x30) + :"x10", "x30", "memory" + ); +} + +__attribute__((naked)) void switchToRoutine() { + __asm__ volatile( + "mov x1, sp\n\t" + "mov %0, x1\n\t" + "mov x1, x29\n\t" + "mov %1, x1\n\t" + "mov x1, x30\n\t" + "mov %2, x1\n\t" + : "=r"(current->sp), "=r"(current->x29), "=r"(current->x30) + : + :"memory", "x1" + ); + + __asm__ volatile( + "mov x10, %0\n\t" + "mov sp, x10\n\t" + "mov x29, %1\n\t" + "mov x30, %2\n\t" + "ret\n\t" + : + :"r"(queue[0].sp), "r"(queue[0].x29), "r"(queue[0].x30) + :"x10", "x30", "memory" + ); +} + +void hello() { + printf("hello\n"); + switchToRoutine(); + printf("world\n"); + switchToRoutine(); + 
printf("111\n"); +} + +void mainRoutineEntry() { + while(1) { + current = &queue[0]; + printf("enter main routine\n"); + Routine* b = &queue[1]; + if (b->dead == 1) { + printf("yes\n"); + __asm__ volatile( + "mov sp, %0\n\t" + "mov x29, %1\n\t" + "mov x30, %2\n\t" + "ret\n\t" + : + :"r"(current->endSp), "r"(current->endX29), "r"(current->endX30) + ); + } else { + current = b; + if (b->start == 0) { + b->start = 1; + switchToInitChildRoutine(); + } else { + switchToChildRoutine(); + } + } + } +}; + +__attribute__((naked)) void execute() { + __asm__ volatile( + "mov x2, sp\n\t" + "str x2, [%0]\n\t" + "str x29, [%1]\n\t" + "str x30, [%2]\n\t" + "ldr x2, [%3]\n\t" + "mov sp, x2\n\t" + "ldr x29, [%4]\n\t" + "ldr x30, [%5]\n\t" + "ret\n\t" + : + :"r"(&queue[0].endSp), "r"(&queue[0].endX29), "r"(&queue[0].endX30),"r"(&queue[0].sp), "r"(&queue[0].x29), "r"(&queue[0].x30) + :"x2", "x30", "memory" + ); +} + +void release() { + free((uint8_t*)(queue[0].stack)); + free((uint8_t*)(queue[1].stack)); +} + +int main() { + createRoutine(hello); + create(); + printf("ok, that's right\n"); + printf("wow\n"); + return 0; +} + +``` + +优化点: +- 只开放创建普通协程的函数`createRoutine` +- 调度协程的创建细节、以及所有协程内存资源的释放,由 `create` 函数隐藏 +- 普通协程内,只能使用`switchToRoutine`主动切换协程,同时感知不到协程结构体 + +上述代码在改进过程中,遇到了内存异常访问的问题,比如bus错误,段错误。根据Stack Overflow上其他人的帮助([看这里](https://stackoverflow.com/questions/79141359/whats-wrong-with-my-implementation-of-c-style-coroutine)),我采用了一些解决方法。 + +对于`execute`函数,以前版本是纯正的C函数,但里边主要做的是上下文切换,最好是纯粹的汇编代码搞定,一般的C函数在经过编译后 +会在函数开头、结尾插入样板代码,比如调整sp寄存器,存储和恢复x29,x30寄存器,ret指令调用。为了不让C编译器搞这些事情,要加入 +`__attribute__((naked))`修饰,这样你在函数里内联了哪些汇编代码,编译后的结果就是这些汇编代码。要注意的是,加入这个修饰 +符号后,函数只能包含内联汇编,不能有C语言代码,但C的全局变量是可以使用的。 + +`execute`和`release`函数不能放入`createMainRoutine`中,要把三者平铺在一个新的函数,也就是`create`内。原因和缓存有关。 +`release`函数需要访问 queue,然后释放堆内存。如果放在 `createMainRoutine`函数的话,编译的时候,queue的值会存入到内存 +中,当接着`execute`函数保存的上下文结果,切换回`createMainRoutine`的时候,queue的值还是旧的,`queue[0].stack`不存在, 
+释放就会出错。解决方式是,不要在`createMainRoutine`创建的栈空间内执行释放,因此要从`createMainRoutine`中把release函数 +拎出来。至于 `execute` 函数,不受此影响,可以放在 `createMainRoutine`,也可以不用放在里边。 + +上述代码夹杂着c代码和汇编,采取默认优化程度(-O0)编译的话,存在问题,需要用-O1级别编译: +```shell +gcc main.c -O1 -o main +``` + +缺陷: +- 不支持多线程 + + +## 调试技巧 +调试前,需要`-g`编译代码: +```shell +gcc main.c -O1 -g -o main +``` + +### lldb 指令 +读取寄存器的值: +```shell +register read x30 +``` + +查看地址0x100003df8开始的汇编代码: +```shell +disassemble --start-address 0x100003df8 +``` + +查看符号wrapper所在的地址: +```shell +image lookup --symbol wrapper +``` + +给指令地址0x0000000100003cc8打断点: +```shell +b 0x0000000100003cc8 +``` +> 地址一定是十六进制表达,不能是十进制表达 + \ No newline at end of file diff --git a/docs/rust/question.md b/docs/rust/question.md index bf376c4..b44982b 100644 --- a/docs/rust/question.md +++ b/docs/rust/question.md @@ -1504,4 +1504,18 @@ fn main() { 在 ok 输出的时候,m其实就没再使用了,也就是死了,但是drop函数却是在 ok2 后边调用的! + +## Sync & Send +这两个Trait是语义Trait,实现它们的时候,不需要实现特定的方法。 + +实现它们仅仅是作为一种语义标记,开发者必须在实现代码的时候保证它们所具备的语义。 + +Send是指一个Rust类型可以安全地从一个线程传递到另一个线程。可能你会说,我们已经有所有权的保证了,线程传递一定是安全的。这可不一定。某些类型在跨线程传递的时候,存在副作用,这是不安全的。比如说Rc类型,你会使用Rc.clone生成一个新的Rc类型拷贝,然后将这个类型拷贝传递给另外一个线程,但这个Rc类型拷贝内部的计数器和原来的是共享的,计数器还不是原子操作,这就不安全了。它可以跨线程传递,但它不安全! + +简言之,如果你能保证你定义的类型,可以跨线程,安全地传递所有权,那么就可以声明实现了Send. 
+ +Sync说的不是类型的所有权,说的是类型的不可变引用。如果一个Rust类型T,在多个线程可以安全使用&T,那么就可以为这个类型T声明实现了Sync。可能你会说,&T会受到借用保护检查,跨线程使用肯定是没问题的。这可不一定。如果是简单的结构体,这话没毛病。但是Rust类型体系中,有一类叫做”内部可变“的类型。你可以使用这种类型的不可变引用,获取到其内部元素的可变引用,如果没有锁保护,跨线程就会有问题。 + +这里有一个比较有趣的关系描述。如果T是Sync的,只能说明&T是Send的,T不一定是Send的。比如`Rc>`是Sync的,但不是Send的。 + \ No newline at end of file From c76b739775fe4b525baaeb9846cdf7fe337960cd Mon Sep 17 00:00:00 2001 From: zhangzhuang08 Date: Thu, 31 Oct 2024 22:58:46 +0800 Subject: [PATCH 2/2] blog: improve simple coroutine implementation with pure assembly code --- docs/blog/coroutine.md | 354 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 354 insertions(+) diff --git a/docs/blog/coroutine.md b/docs/blog/coroutine.md index 775bb92..148a7f3 100644 --- a/docs/blog/coroutine.md +++ b/docs/blog/coroutine.md @@ -668,6 +668,332 @@ gcc main.c -O1 -o main - 不支持多线程 +## 汇编纯享版 +```c +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> + +#define STACK_SIZE 1024 + +typedef struct { + u_int64_t x0; + /** 栈顶寄存器 */ + u_int64_t sp; + /** 栈底寄存器 */ + u_int64_t x29; + /** 返回地址寄存器 */ + u_int64_t x30; + /** 协程结束,应该跳转到哪个地址 */ + u_int64_t endAddr; + u_int64_t endSp; + u_int64_t endX29; + u_int64_t endX30; + + uint64_t entry; + uint64_t stack; + uint64_t size; + uint64_t dead; + uint64_t start; +} Routine; + +Routine* queueZero(); +Routine* queueOne(); + +typedef void (*Entry)(); + +Routine* current; +Routine queue[2]; + +void switchToRoutine(); + +void markCurrentDead() { + current->dead = 1; +}; + +__attribute__((naked)) void wrapper() { + __asm__ volatile( + "sub sp, sp, #16\n\t" + "str x29, [sp]\n\t" + "str x30, [sp, #8]\n\t" + "blr x0\n\t" + "bl _markCurrentDead\n\t" + "bl _switchToRoutine\n\t" + "ldr x29, [sp]\n\t" + "ldr x30, [sp, #8]\n\t" + "add sp, sp, #16\n\t" + "ret\n\t" + : + : + :"memory" + ); +} + +void createRoutine(Entry entry) { + u_int8_t* stack = (u_int8_t*)malloc(STACK_SIZE); + if (stack == NULL) { + perror("Failed to allocate stack memory"); + exit(EXIT_FAILURE); + } + Routine r; + uint64_t aligned_sp = 
(uint64_t)(stack + STACK_SIZE - 1) & ~0xF; // 16字节对齐 + r.sp = aligned_sp; + r.x29 = aligned_sp; + r.entry = (uint64_t)wrapper; + r.x30 = (uint64_t)wrapper; + r.size = STACK_SIZE - 1; + r.dead = 0; + r.stack = (uint64_t)stack; + r.x0 = (uint64_t)entry; + r.start = 0; + + queue[1] = r; +} + +void execute(); +void release(); +void mainRoutineEntry(); + +void createMainRoutine(Entry entry) { + u_int8_t* stack = (u_int8_t*)malloc(STACK_SIZE); + if (stack == NULL) { + perror("Failed to allocate stack memory"); + exit(EXIT_FAILURE); + } + Routine r; + uint64_t aligned_sp = (uint64_t)(stack + STACK_SIZE - 1) & ~0xF; // 16字节对齐 + r.sp = aligned_sp; + r.x29 = aligned_sp; + r.entry = (uint64_t)entry; + r.x30 = (uint64_t)entry; + r.size = STACK_SIZE - 1; + r.dead = 0; + r.stack = (uint64_t)stack; + r.x0 = (uint64_t)entry; + + queue[0] = r; +} + +__attribute__((naked)) void create() { + __asm__ volatile( + "sub sp, sp, #16\n\t" + "str x29, [sp]\n\t" + "str x30, [sp, #8]\n\t" + "adrp x0, _mainRoutineEntry@PAGE\n\t" + "add x0, x0, _mainRoutineEntry@PAGEOFF\n\t" + "bl _createMainRoutine\n\t" + "bl _execute\n\t" + "bl _release\n\t" + "ldr x29, [sp]\n\t" + "ldr x30, [sp, #8]\n\t" + "add sp, sp, #16\n\t" + "ret\n\t" + ); +} + +__attribute__((naked)) void switchToInitChildRoutine_1() { + __asm__ volatile( + "mov x10, sp\n\t" + "str x10, [x0, #8]\n\t" + "str x29, [x0, #16]\n\t" + "str x30, [x0, #24]\n\t" + "ldr x10, [x1, #8]\n\t" + "mov sp, x10\n\t" + "ldr x29, [x1, #16]\n\t" + "ldr x30, [x1, #24]\n\t" + "ldr x0, [x1]\n\t" + "ret\n\t" + : + : + :"memory", "x10", "x30", "x0", "x1" + ); +} + +__attribute__((naked)) void switchToChildRoutine_1() { + __asm__ volatile( + "mov x10, sp\n\t" + "str x10, [x0, #8]\n\t" + "str x29, [x0, #16]\n\t" + "str x30, [x0, #24]\n\t" + "ldr x10, [x1, #8]\n\t" + "mov sp, x10\n\t" + "ldr x29, [x1, #16]\n\t" + "ldr x30, [x1, #24]\n\t" + "ret\n\t" + : + : + :"memory","x10", "x30" + ); +} + +__attribute__((naked)) void switchToRoutine_1() { + __asm__ volatile( + 
// 保存当前上下文 + "mov x2, sp\n\t" + "str x2, [x0, #8]\n\t" + "str x29, [x0, #16]\n\t" + "str x30, [x0, #24]\n\t" + // 加载主协程上下文 + "ldr x2, [x1, #8]\n\t" + "mov sp, x2\n\t" + "ldr x29, [x1, #16]\n\t" + "ldr x30, [x1, #24]\n\t" + "ret\n\t" + : + : + :"x2", "memory", "x1" + ); +} + +__attribute__((naked)) void switchToRoutine() { + __asm__ volatile( + "sub sp, sp, #16\n\t" + "str x29, [sp]\n\t" + "str x30, [sp, #8]\n\t" + "bl _getMain\n\t" + "mov x1, x0\n\t" + "bl _getCurrent\n\t" + "ldr x29, [sp]\n\t" + "ldr x30, [sp, #8]\n\t" + "add sp, sp, #16\n\t" + "b _switchToRoutine_1\n\t" + : + : + :"x0","x1" + ); +} + + +void hello() { + printf("hello\n"); + switchToRoutine(); + printf("world\n"); + switchToRoutine(); + printf("111\n"); +} + +uint64_t getMain() { + return (uint64_t)&queue[0]; +} + +uint64_t getCurrent() { + return (uint64_t)&current->x0; +} + +uint64_t choose() { + if (queue[1].dead == 0) { + current = &queue[1]; + return (uint64_t)&queue[1]; + } + current = &queue[0]; + return 0; +} + +__attribute__((naked)) void mainRoutineEntry() { + __asm__ volatile( + "get_start: \n\t" + "bl _choose\n\t" + "cbz x0, 1f\n\t" + "ldr x1, [x0, #96]\n\t" + "cbz x1, 2f\n\t" + "mov x1, x0\n\t" + "bl _getMain\n\t" + "bl _switchToChildRoutine_1\n\t" + "b get_start\n\t" + "2: \n\t" + "mov x1, #1\n\t" + "str x1, [x0, #96]\n\t" + "mov x1, x0\n\t" + "bl _getMain\n\t" + "bl _switchToInitChildRoutine_1\n\t" + "b get_start\n\t" + "1: \n\t" + "bl _getMain\n\t" + "ldr x10, [x0, #40]\n\t" + "mov sp, x10\n\t" + "ldr x29, [x0, #48]\n\t" + "ldr x30, [x0, #56]\n\t" + "ret\n\t" + ); +} + +__attribute__((naked)) void execute() { + __asm__ volatile( + "mov x2, sp\n\t" + "mov x3, x30\n\t" + "bl _getMain\n\t" + "str x2, [x0, #40]\n\t" + "str x29, [x0, #48]\n\t" + "str x3, [x0, #56]\n\t" + "ldr x2, [x0, #8]\n\t" + "mov sp, x2\n\t" + "ldr x29, [x0, #16]\n\t" + "ldr x30, [x0, #24]\n\t" + "ret\n\t" + : + : + :"x2","memory","x3" + ); +} + +void release() { + free((uint8_t*)(queue[0].stack)); + 
free((uint8_t*)(queue[1].stack)); +} + +int main() { + createRoutine(hello); + + create(); + + printf("ok, that's right\n"); + printf("wow\n"); + + return 0; +} +``` + +使用`-O0`编译后,结果可以运行,但是不稳定,有的时候成功,有的时候失败,报如下的错误: +```txt +malloc: Region cookie corrupted for region 0x139800000 (value is 0)[0x1398081fc] +``` +在上一个版本中,如果用`-O0`编译,一定会报错。 + +内联汇编中,要引入的C数据较多时,生成的汇编代码中会有非常多的寄存器,寄存器越多,就会引入寄存器之间的污染,导致异常的结果。因此,这个版本做了改进,能用汇编代码的地方,全都改用汇编代码。 + +这里说一下循环语句怎么用汇编写出来: +```c +__attribute__((naked)) void mainRoutineEntry() { + __asm__ volatile( + "get_start: \n\t" + "bl _choose\n\t" + "cbz x0, 1f\n\t" + "ldr x1, [x0, #96]\n\t" + "cbz x1, 2f\n\t" + "mov x1, x0\n\t" + "bl _getMain\n\t" + "bl _switchToChildRoutine_1\n\t" + "b get_start\n\t" + "2: \n\t" + "mov x1, #1\n\t" + "str x1, [x0, #96]\n\t" + "mov x1, x0\n\t" + "bl _getMain\n\t" + "bl _switchToInitChildRoutine_1\n\t" + "b get_start\n\t" + "1: \n\t" + "bl _getMain\n\t" + "ldr x10, [x0, #40]\n\t" + "mov sp, x10\n\t" + "ldr x29, [x0, #48]\n\t" + "ldr x30, [x0, #56]\n\t" + "ret\n\t" + ); +} +``` + +开头的标签可以随意命名,比如`get_start`,内部的标签,在定义的时候,用数字表示,在跳转的时候,要使用数字+f, 比如 1f 2f. + ## 调试技巧 调试前,需要`-g`编译代码: ```shell @@ -696,4 +1022,32 @@ b 0x0000000100003cc8 ``` > 地址一定是十六进制表达,不能是十进制表达 + +展示当前的汇编代码: +```shell +disassemble +``` + +单步运行汇编指令(step over): +```shell +ni +``` + +单步运行汇编指令(step in): +```shell +si +``` + +继续执行: +```shell +c +``` + +读取某个地址附近的数据: +```shell +memory read 0x1000 --size 16 +``` +> 读取0x1000开始的16个字节 + + \ No newline at end of file