【施工完成】CSAPP archlab

背景

CSAPP:3e第四章配套的实验。 第四章是讲处理器架构的,章节的重点是实现一个六阶段流水线。

lab的内容也是,需要实现一个Y86-64的流水线,并进行性能调优。

准备工作

材料里面sim.tar解压之后,有可能是没办法编译通过的。 原因是tk(a windowing toolkit )在8.5版本的时候废弃了某个接口,与8.6版本不兼容。

因此需要安装tk 8.5或者更低的版本。 在archlinux/manjaro下可以通过aur安装tk85.

use tk<=8.5,8.6 or above will not compile

Modify the corresponding header file in seq/ssim.c and pipe/psim.c (assmue tk8.5 header file is located in /usr/local/tk8.5)

1
2#ifdef HAS_GUI
3#include <tk8.5/tk.h>  // #include <tk/tk.h>
4#endif /* HAS_GUI */
5
6
7

modify Makefile in sim

 1
 2
 3
 4# Comment this out if you don't have Tcl/Tk on your system
 5
 6GUIMODE=-DHAS_GUI 
 7
 8# Modify the following line so that gcc can find the libtcl.so and
 9# libtk.so libraries on your system. You may need to use the -L option
10# to tell gcc which directory to look in. Comment this out if you
11# don't have Tcl/Tk.
12
13TKLIBS=-L/usr/lib/tcl8.5/ -ltk -ltcl
14
15# Modify the following line so that gcc can find the tcl.h and tk.h
16# header files on your system. Comment this out if you don't have
17# Tcl/Tk.
18
19TKINC=-isystem /usr/include/tcl8.5/
20
21
22

comment matherr in seq/ssim.c and pipe/psim.c

1
2/* Hack for SunOS */
3// extern int matherr();
4// int *tclDummyMathPtr = (int *) matherr;
5
6
7

PART A

Iteratively sum linked list elements

要求写一段y86-64的汇编代码,实现对一个链表元素求和,也就是如下c语言的相同功能代码。

 1
 2/* sum_list - Sum the elements of a linked list */
 3long sum_list(list_ptr ls)
 4{
 5    long val = 0;
 6    while (ls) {
 7	val += ls->val;
 8	ls = ls->next;
 9    }
10    return val;
11}
12
13

我们可以先瞅瞅这段c代码被汇编成x86-64的汇编代码是什么样。记得编译选项要 gcc -S -Og,不然默认的优化选项可能不是很容易明白。

得到

 1
 2sum_list:
 3.LFB0:
 4	.cfi_startproc
 5	movl	$0, %eax
 6.L2:
 7	testq	%rdi, %rdi
 8	je	.L4
 9	addq	(%rdi), %rax
10	movq	8(%rdi), %rdi
11	jmp	.L2
12.L4:
13	ret
14
15

同时,我们可以参考CSAPP:3e Figure 4.7 Y86-64的代码示例

 1# Execution begins at address 0
 2.pos 0
 3irmovq stack, %rsp
 4# Set up stack pointer
 5call main
 6# Execute main program
 7halt
 8# Terminate program
 9
10# Array of 4 elements
11.align 8
12array:
13.quad 0x000d000d000d
14.quad 0x00c000c000c0
15.quad 0x0b000b000b00
16.quad 0xa000a000a000
17
18main:
19irmovq array,%rdi
20irmovq $4,%rsi
21call sum
22ret
23# sum(array, 4)
24
25# long sum(long *start, long count)
26# start in %rdi, count in %rsi
27sum:
28irmovq $8,%r8
29# Constant 8
30irmovq $1,%r9
31# Constant 1
32xorq %rax,%rax
33# sum = 0
34andq %rsi,%rsi
35# Set CC
36jmp
37test
38# Goto test
39loop:
40mrmovq (%rdi),%r10
41# Get *start
42addq %r10,%rax
43# Add to sum
44addq %r8,%rdi
45# start++
46subq %r9,%rsi
47# count--. Set CC
48test:
49jne
50loop
51# Stop when 0
52ret
53# Return
54# Stack starts here and grows to lower addresses
55.pos 0x200
56stack:
57

比较容易得到,我们需要的sum_list函数的Y86-64代码是:

 1
 2    .pos 0
 3    irmovq stack, %rsp  # Set up stack pointer
 4    call main # Execute main function
 5    halt
 6
 7# Sample linked list
 8    .align 8
 9ele1:
10    .quad 0x00a
11    .quad ele2
12ele2:
13    .quad 0x0b0
14    .quad ele3
15ele3:
16    .quad 0xc00
17    .quad 0
18
19
20main:
21    irmovq ele1,%rdi
22    call sum_list
23    ret
24
25sum_list:
26    irmovq $0, %rax 
27loop:
28    andq  %rdi, %rdi
29    je finish
30    mrmovq (%rdi),%r8
31    addq %r8, %rax
32    mrmovq 8(%rdi), %rdi
33    jmp loop
34finish:
35    ret
36
37.pos 0x200
38stack:
39

这里有几点要注意:

  • Y86-64的代码最后必须有空行
  • Y86-64的cpu中,ALU的输入不能是立即数,因此如果操作数有立即数,需要先将立即数mov到寄存器中。
  • 与上一条类似,Y86-64的ALU的输入不能是内存中的地址,因此如果操作数有内存中的地址,需要先将改地址的内容mov到寄存器中。
  • Y86-64的mov根据操作数的不同类别分了不同的指令,但是没有根据操作数的size来区分不同指令。
  • Y86-64在ALU做完操作之后会自动设置相应的condition code(CC)

那么如何验证正确性呢?

1
2$  ./yas sum.ys
3$  ./yis sum.yo
4

得到结果:

 1
 2Stopped in 28 steps at PC = 0x13.  Status 'HLT', CC Z=1 S=0 O=0
 3Changes to registers:
 4%rax:   0x0000000000000000      0x0000000000000cba
 5%rsp:   0x0000000000000000      0x0000000000000200
 6%r8:    0x0000000000000000      0x0000000000000c00
 7
 8Changes to memory:
 90x01f0: 0x0000000000000000      0x000000000000005b
100x01f8: 0x0000000000000000      0x0000000000000013
11

%rax中存放的结果为0xcba,结果正确。

Recursively sum linked list elements

思路和上面类似,不多说了。

 1
 2    .pos 0
 3    irmovq stack, %rsp  # Set up stack pointer
 4    call main # Execute main function
 5    halt
 6
 7# Sample linked list
 8    .align 8
 9ele1:
10    .quad 0x00a
11    .quad ele2
12ele2:
13    .quad 0x0b0
14    .quad ele3
15ele3:
16    .quad 0xc00
17    .quad 0
18
19
20main:
21    irmovq ele1,%rdi
22    call rsum_list
23    ret
24
25rsum_list:
26    andq  %rdi, %rdi
27    je finish
28    pushq %rbx
29    mrmovq (%rdi),%rbx     # val
30    mrmovq 8(%rdi), %rdi   # ls->next
31    call rsum_list
32    addq %rbx, %rax
33    popq %rbx
34    ret 
35
36finish:
37    irmovq $0, %rax 
38    ret
39
40.pos 0x200
41stack:
42
43
44
45

Copy a source block to a destination block

注意有三个参数,按照x86-64调用惯例,三个参数依次放入寄存器:

%rdi,%rsi,%rdx

x86-64调用惯例.png

除此以外没什么需要注意的。。

 1
 2    .pos 0
 3    irmovq stack, %rsp  # Set up stack pointer
 4    call main # Execute main function
 5    halt
 6
 7.align 8
 8# Source block
 9src:
10    .quad 0x00a
11    .quad 0x0b0
12    .quad 0xc00
13# Destination block
14dest:
15    .quad 0x111
16    .quad 0x222
17    .quad 0x333
18
19
20main:
21    irmovq src,%rdi
22    irmovq dest,%rsi
23    irmovq $3,%rdx
24
25    call copy_block
26    ret
27
28copy_block:
29    irmovq $8,%r8
30    irmovq $1,%r9
31    irmovq $0, %rcx 
32loop:
33    andq  %rdx, %rdx
34    jle finish
35    mrmovq (%rdi),%rax
36    rmmovq %rax,(%rsi)
37    xorq %rax,%rcx
38    subq %r9, %rdx
39    addq %r8,%rdi
40    addq %r8,%rsi
41    jmp loop
42finish:
43    rrmovq  %rcx,%rax
44    ret
45
46.pos 0x200
47stack:
48
49
50
51

PART B

要求

part B要求扩展 SEQ processor 来实现iadd指令,也就是支持立即数的加法指令。具体的要求可以参考CSAPP:3e的Homework problems 4.51 and 4.52

4.51 ◆ Practice Problem 4.3 introduced the iaddq instruction to add immediate data to a register. Describe the computations performed to implement this instruction. Use the computations for irmovq and OPq (Figure 4.18) as a guide.

4.52 ◆◆ The file seq-full.hcl contains the HCL description for SEQ, along with the declaration of a constant IIADDQ having hexadecimal value C, the instruction code for iaddq. Modify the HCL descriptions of the control logic blocks to implement the iaddq instruction, as described in Practice Problem 4.3 and Problem 4.51. See the lab material for directions on how to generate a simulator for your solution and how to test it.

Figure 4.18 Y86-64指令描述.png

Practice Problem 4.3 practice_4.3.png

实现

先写出iaddq的stage描述:

iaddq V,rB

  • Fetch
    • icode:ifun <-- M1[PC]
    • rA:rB <-- M1[PC+1]
    • valC <-- M8[PC+2]
    • valP <-- PC+10
  • Decode
    • valB <-- R[rB]
  • Execute
    • valE <-- valB + valC
  • Memory
  • Write back
    • R[rB] <-- valE
  • PC update
    • PC <-- valP

然后修改seq-full.hcl。 参考IOPQ 和IIRMOVQ 来实现还是比较容易的。 下面把修改过的位置放了出来:

 1
 2################ Fetch Stage     ###################################
 3
 4bool instr_valid = icode in 
 5	{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
 6	       IOPQ, IIADDQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };  
 7             
 8
 9// Does fetched instruction require a regid byte?   
10bool need_regids =
11	icode in { IRRMOVQ, IOPQ, IIADDQ, IPUSHQ, IPOPQ, 
12		     IIRMOVQ, IRMMOVQ, IMRMOVQ };
13
14
15// Does fetched instruction require a constant word?
16bool need_valC =
17	icode in { IIRMOVQ, IIADDQ,IRMMOVQ, IMRMOVQ, IJXX, ICALL };
18
19
20// ################ Decode Stage    ###################################
21
22
23// What register should be used as the B source?
24word srcB = [
25	icode in { IOPQ, IIADDQ, IRMMOVQ, IMRMOVQ  } : rB;
26	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
27	1 : RNONE;  # Don't need register
28];
29
30
31// What register should be used as the E destination?
32word dstE = [
33	icode in { IRRMOVQ } && Cnd : rB;
34	icode in { IIRMOVQ, IOPQ,IIADDQ} : rB;
35	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
36	1 : RNONE;  # Don't write any register
37];
38
39// ################ Execute Stage   ###################################
40
41
42
43// Select input A to ALU
44word aluA = [
45	icode in { IRRMOVQ, IOPQ } : valA;
46	icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : valC;
47	icode in { ICALL, IPUSHQ } : -8;
48	icode in { IRET, IPOPQ } : 8;
49	# Other instructions don't need ALU
50];
51
52
53// Select input B to ALU
54word aluB = [
55	icode in { IRMMOVQ, IMRMOVQ, IOPQ, IIADDQ, ICALL, 
56		      IPUSHQ, IRET, IPOPQ } : valB;
57	icode in { IRRMOVQ, IIRMOVQ } : 0;
58	# Other instructions don't need ALU
59];
60
61
62// Set the ALU function
63word alufun = [
64	icode == IOPQ : ifun;
65	1 : ALUADD;
66];
67
68
69// Should the condition codes be updated?
70bool set_cc = icode in { IOPQ,IIADDQ };
71
72

测试

在benchmark上测试

 1
 2➜  y86-code git:(master) ✗ make testssim
 3../seq/ssim -t asum.yo > asum.seq
 4../seq/ssim -t asumr.yo > asumr.seq
 5../seq/ssim -t cjr.yo > cjr.seq
 6../seq/ssim -t j-cc.yo > j-cc.seq
 7../seq/ssim -t poptest.yo > poptest.seq
 8../seq/ssim -t pushquestion.yo > pushquestion.seq
 9../seq/ssim -t pushtest.yo > pushtest.seq
10../seq/ssim -t prog1.yo > prog1.seq
11../seq/ssim -t prog2.yo > prog2.seq
12../seq/ssim -t prog3.yo > prog3.seq
13../seq/ssim -t prog4.yo > prog4.seq
14../seq/ssim -t prog5.yo > prog5.seq
15../seq/ssim -t prog6.yo > prog6.seq
16../seq/ssim -t prog7.yo > prog7.seq
17../seq/ssim -t prog8.yo > prog8.seq
18../seq/ssim -t ret-hazard.yo > ret-hazard.seq
19grep "ISA Check" *.seq
20asumr.seq:ISA Check Succeeds
21asum.seq:ISA Check Succeeds
22cjr.seq:ISA Check Succeeds
23j-cc.seq:ISA Check Succeeds
24poptest.seq:ISA Check Succeeds
25prog1.seq:ISA Check Succeeds
26prog2.seq:ISA Check Succeeds
27prog3.seq:ISA Check Succeeds
28prog4.seq:ISA Check Succeeds
29prog5.seq:ISA Check Succeeds
30prog6.seq:ISA Check Succeeds
31prog7.seq:ISA Check Succeeds
32prog8.seq:ISA Check Succeeds
33pushquestion.seq:ISA Check Succeeds
34pushtest.seq:ISA Check Succeeds
35ret-hazard.seq:ISA Check Succeeds
36rm asum.seq asumr.seq cjr.seq j-cc.seq poptest.seq pushquestion.seq pushtest.seq prog1.seq prog2.seq prog3.seq prog4.seq prog5.seq prog6.seq prog7.seq prog8.seq ret-hazard.seq
37

执行回归测试

 1
 2➜  ptest git:(master) ✗ make SIM=../seq/ssim
 3./optest.pl -s ../seq/ssim 
 4Simulating with ../seq/ssim
 5  All 49 ISA Checks Succeed
 6./jtest.pl -s ../seq/ssim 
 7Simulating with ../seq/ssim
 8  All 64 ISA Checks Succeed
 9./ctest.pl -s ../seq/ssim 
10Simulating with ../seq/ssim
11  All 22 ISA Checks Succeed
12./htest.pl -s ../seq/ssim 
13Simulating with ../seq/ssim
14  All 600 ISA Checks Succeed
15➜  ptest git:(master) ✗ make SIM=../seq/ssim TFLAGS=-i
16./optest.pl -s ../seq/ssim -i
17Simulating with ../seq/ssim
18  All 58 ISA Checks Succeed
19./jtest.pl -s ../seq/ssim -i
20Simulating with ../seq/ssim
21  All 96 ISA Checks Succeed
22./ctest.pl -s ../seq/ssim -i
23Simulating with ../seq/ssim
24  All 22 ISA Checks Succeed
25./htest.pl -s ../seq/ssim -i
26Simulating with ../seq/ssim
27  All 756 ISA Checks Succeed
28
29

全部都是" ISA Checks Succeed",说明结果应该OK

PART C

这个lab的前两部分给人一种很简单的错觉。。。。让人觉得,不过如此嘛

然后就死了。。。也可能是我的思路不对。。。

以为homework会比lab内容简单。。。 于是先跑去做了pipe-bt..把分支预测改成not token... 没做出来。。

又去做了pipe-nobypass,改成没有forwarding机制。。 发现要考虑的case多到爆炸。。调到怀疑人生,怀疑我是不是不适合CS.

大概卡了一周。。不得已去搜答案。。发现做课后几个pipeline相关题目的人并不多。。而且似乎比lab本身要难很多。。。

然后循环展开的优化打算读完第五章再说。。

20200425更新:

转眼都快五月了,终于把这个坑填了.

说一下我优化的思路吧. 根据提示,循环展开肯定是一个非常重要的优化. 于是先尝试实现一个2x1的循环展开看下效果.

先把对应的c的代码写了出来:

 1
 2typedef int word_t;
 3word_t ncopy2(word_t *src, word_t *dst, word_t len)
 4{
 5    word_t count = 0;
 6    word_t val;
 7    word_t val2;
 8
 9    while (len > 1) {
10        val = *src++;
11        val2 = *src++;
12        *dst++ = val;
13        *dst++ = val2;
14        if (val > 0)
15            count++;
16        if (val2 > 0)
17            count++;
18        len-=2;
19    }
20    while (len>0){
21        val = *src;
22        *dst = val;
23        if (val>0)
24        count++;
25        len--;
26    }
27    return count;
28}
29

遇到的第一个问题是如何判断len>1.. 如果是x86汇编可以使用cmp指令,但是Y86并没有这个指令.

考虑到cmp是不影响寄存器值的sub,参考How to convert IA32 'cmp' instruction to Y86?

我们可以用栈来保护被sub影响的值

1
2pushl %ecx
3subl  %ebx, %ecx
4popl  %ecx
5

于是得到2x1循环展开的初始版本,CPE是11.6,分数是0

 1
 2# You can modify this portion
 3	# Loop header
 4	xorq %rax,%rax		# count = 0;
 5	irmovq $1, %r10
 6	pushq %rdx
 7	subq  %r10, %rdx
 8	popq  %rdx		# len <= 1?
 9	jle Single		# if so, goto Done:
10
11Loop:	
12	mrmovq (%rdi), %r10	# read val from src...
13	mrmovq 8(%rdi), %r11 # read val2 from src++
14	rmmovq %r10, (%rsi)	# ...and store it to dst
15	rmmovq %r11, 8(%rsi) # .. store val2 to dst++
16	andq %r10, %r10		# val <= 0?
17	jle Npos		# if so, goto Npos:
18	iaddq $1, %rax
19	# irmovq $1, %r10
20	# addq %r10, %rax		# count++
21
22Npos:
23	andq %r11, %r11
24	jle	Npos2
25	# irmovq $1, %r10
26	# addq %r10, %rax
27	iaddq $1, %rax
28
29Npos2:	
30    irmovq $2, %r10
31	subq %r10, %rdx		# len-=2
32	irmovq $16, %r10
33	addq %r10, %rdi		# src++
34	addq %r10, %rsi		# dst++
35	irmovq $1, %r10
36	pushq %rdx
37	subq  %r10, %rdx
38	popq  %rdx		# len > 1?
39
40	jg Loop			# if so, goto Loop:
41
42Single:
43	andq %rdx, %rdx
44	jle Done
45	mrmovq (%rdi),%r10
46	rmmovq %r10, (%rsi)
47	andq %r10, %r10
48	jle Done 
49	# irmovq $1, %r10
50	# addq %r10, %rax		# count++
51	iaddq $1, %rax
52##################################################################
53# Do not modify the following section of code
54# Function epilogue.
55Done:
56	ret
57

接下来,我们发现由于一些指令无法使用立即数作为参数,%r10这个寄存器在频繁地作为临时变量存储一些常数. 我们考虑使用更多的寄存器,来存储这几个常数. CPE可以从11.6提高到10.6. 这个时候,仍然是0分.

想起我们之前实现了的iaddq指令. 不妨把所有的加法和减法运算都用iaddq来替代..CPE课可以从10.6到10.41. 终于有分了.

然后观察代码..发现瓶颈可能在push和pop... 因为压栈和出栈都要访问内存.仔细思考一下,其实不是一定要和x86的cmp指令等价. 因此其实是没必要来保护对应寄存器的值的.

于是得到如下CPE可以到9.48的版本:

 1
 2# You can modify this portion
 3	# Loop header
 4	xorq %rax,%rax		# count = 0;
 5	irmovq $1, %r10
 6	pushq %rdx
 7	subq  %r10, %rdx
 8	popq  %rdx		# len <= 1?
 9	jle Single		# if so, goto Done:
10
11Loop:	
12	mrmovq (%rdi), %r10	# read val from src...
13	mrmovq 8(%rdi), %r11 # read val2 from src++
14	rmmovq %r10, (%rsi)	# ...and store it to dst
15	rmmovq %r11, 8(%rsi) # .. store val2 to dst++
16	andq %r10, %r10		# val <= 0?
17	jle Npos		# if so, goto Npos:
18	iaddq $1, %rax
19	# irmovq $1, %r10
20	# addq %r10, %rax		# count++
21
22Npos:
23	andq %r11, %r11
24	jle	Npos2
25	# irmovq $1, %r10
26	# addq %r10, %rax
27	iaddq $1, %rax
28
29Npos2:	
30    irmovq $2, %r10
31	subq %r10, %rdx		# len-=2
32	irmovq $16, %r10
33	addq %r10, %rdi		# src++
34	addq %r10, %rsi		# dst++
35	irmovq $1, %r10
36	pushq %rdx
37	subq  %r10, %rdx
38	popq  %rdx		# len > 1?
39
40	jg Loop			# if so, goto Loop:
41
42Single:
43	andq %rdx, %rdx
44	jle Done
45	mrmovq (%rdi),%r10
46	rmmovq %r10, (%rsi)
47	andq %r10, %r10
48	jle Done 
49	# irmovq $1, %r10
50	# addq %r10, %rax		# count++
51	iaddq $1, %rax
52##################################################################
53# Do not modify the following section of code
54# Function epilogue.
55Done:
56	ret
57

至此,我们已经确认循环展开是有效的. 因此不妨使用4x1的循环展开. 同时我们删除了一些冗余的代码,最终达到了8.68的CPE.

分数为36.4/60.. 及格了...orz

完整代码可以参考ncopy.ys

然后接下来的优化方向.. 毕竟容易想到的是条件mov的指令.. 但是条件mov指令应该主要控制分支跳转的代价比较大时,才会比较有用.而我们的程序应该不存在比较大的开销.

然后其他的就是.. pipe-full.hcl文件我们除了增加iaddq指令外还没修改过... 可能会有进一步的优化空间

Posts in this Series