Arrays versus Pointers

Array approach:
    proc1 ( int v[ ], int n )
    {   int i;
        for ( i = 0; i < n; i = i + 1 )
            /* some function of v[i] */ ;
    }

Pointer approach ( &v[0] means the address of v[0]; *p means the object pointed to by p ):
    proc2 ( int *v, int n )
    {   int *p;
        for ( p = &v[0]; p < &v[n]; p = p + 1 )
            /* some function of *p */ ;
    }

[Memory-layout diagram: p starts at &v[0] and advances through v[0], v[1], ..., v[j], ..., v[n-1]; the loop stops when p reaches &v[n].]
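For reference, a minimal runnable sketch of the two traversal styles, assuming the loop body is a simple sum; the names sum_array and sum_pointer and the test data are illustrative, not from the slides.

    #include <stdio.h>

    /* Array-index traversal: the compiled code must scale i by 4 on every pass. */
    int sum_array(int v[], int n)
    {
        int i, s = 0;
        for (i = 0; i < n; i = i + 1)
            s = s + v[i];            /* "some function of v[i]" */
        return s;
    }

    /* Pointer traversal: p advances by one element (4 bytes for an int) per pass. */
    int sum_pointer(int *v, int n)
    {
        int *p, s = 0;
        for (p = &v[0]; p < &v[n]; p = p + 1)
            s = s + *p;              /* "some function of *p" */
        return s;
    }

    int main(void)
    {
        int v[5] = { 1, 2, 3, 4, 5 };
        printf("%d %d\n", sum_array(v, 5), sum_pointer(v, 5));   /* prints: 15 15 */
        return 0;
    }

Both functions compute the same result; the difference shows up only in the generated code, as the MIPS listings below illustrate.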
Array approach (ignore linkage code)

C loop:     for ( i = 0; i < n; i = i + 1 )  /* some function of v[i] */
Registers:  $a0 is the base address of v, $a1 is n, $t0 is i

        add  $t0, $zero, $zero   # i = 0
    L1: add  $t1, $t0, $t0       # $t1 = 2*i
        add  $t1, $t1, $t1       # $t1 = 4*i
        add  $t2, $a0, $t1       # $t2 = addr of v[i]
        lw   $s0, 0($t2)         # $s0 = v[i]
        ...                      # some function of v[i]
        addi $t0, $t0, 1         # i = i + 1
        slt  $t3, $t0, $a1       # $t3 = 1 if i < n
        bne  $t3, $zero, L1      # if $t3 = 1 goto L1
Pointer approach (ignore linkage code)

C loop:     for ( p = &v[0]; p < &v[n]; p = p + 1 )  /* some function of *p */
Registers:  $a0 is the base address of v, $a1 is n, $t0 is p

        add  $t0, $a0, $zero     # p = addr of v[0]
        add  $t1, $a1, $a1       # $t1 = 2*n
        add  $t1, $t1, $t1       # $t1 = 4*n
        add  $t2, $a0, $t1       # $t2 = addr of v[n]
    L2: lw   $s0, 0($t0)         # $s0 = *p
        ...                      # some function of *p
        addi $t0, $t0, 4         # p = p + 4 (next int)
        slt  $t3, $t0, $t2       # $t3 = 1 if p < addr of v[n]
        bne  $t3, $zero, L2      # if $t3 = 1 goto L2
Comparing the two loops:
  - The pointer approach has three fewer instructions inside the loop (4 versus 7, not counting the loop body).
  - The pointer is incremented directly by 4, and the address of v[n] is computed once before the loop.
  - This avoids multiplying the index by 4 on every pass.
Design Principles

1. Simplicity favors regularity
   - All instructions are the same size
   - Always 3 register operands in arithmetic instructions
   - Register fields are in the same place in each format

2. Smaller is faster
   - 32 registers
   - Reduced number of instructions

3. Good design demands good compromise
   - Word length vs. address and constant length

4. Make the common case fast
   - Immediate addressing for constant operands
   - PC-relative addressing for branches

Evolution from CISC (Complex Instruction Set Computers) to RISC (Reduced Instruction Set Computers)
Different Architectures

Accumulator architecture: one register, one operand per instruction
  Ex: A = B + C
      load  AddressB          # Acc = B
      add   AddressC          # Acc = B + C
      store AddressA          # A = B + C

Register-Memory architecture: a few registers, two operands per instruction
  Ex: A = B + C  (assume B is already in Reg2)
      add   Reg2, AddressC    # Reg2 = B + C
      store Reg2, AddressA    # A = B + C

Load-Store (Register-Register) architecture: many registers, three operands per instruction
  Ex: A = B + C  (assume B is in Reg2 and C is in Reg3)
      add   Reg1, Reg2, Reg3  # Reg1 = B + C
Year   Machine           Registers   Architecture      Instr. Length
1953   IBM 701           1           accumulator
1963   CDC 6600          8           load-store
1964   IBM 360           16          register-memory   2-6 bytes
1970   DEC PDP-11        8           register-memory
1972   Intel 8008        1           accumulator
1974   Motorola 6800     2           accumulator
1977   DEC VAX           16          register-memory   1-54 bytes
1978   Intel 8086        1           extended accum.
1980   Motorola 68000    16          register-memory
1985   Intel 80386       8           register-memory   1-17 bytes
1985   MIPS              32          load-store        4 bytes
1987   SPARC             32          load-store        4 bytes
       PowerPC           32          load-store        4 bytes
       DEC Alpha         32          load-store        4 bytes
2003   Intel Itanium     128+        load-store        3 in 16 bytes
Performance of Computers

Goals:
  - Measure performance
  - Compare measurements
  - Understand what affects the measurements

Uses:
  - Selection of computers
  - Optimization of the design of the architecture, software, and hardware
Comparing Airplanes

Airplane           Capacity   Speed (mph)   Throughput (passenger-mph)
Boeing 777         375        610           228,750
Boeing 747         470        610           286,700
Concorde           132        1350          178,200
Douglas DC-8-50    146        544            79,424

Assess performance by specifications: which is the highest performer?
  - By speed: the Concorde
  - By throughput (capacity x speed): the Boeing 747
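A quick sketch that recomputes the throughput column from the table above; the struct and field names are illustrative.

    #include <stdio.h>

    /* Throughput here means capacity x speed, in passenger-miles per hour. */
    struct plane { const char *name; int capacity; int speed_mph; };

    int main(void)
    {
        struct plane planes[] = {
            { "Boeing 777",      375,  610 },
            { "Boeing 747",      470,  610 },
            { "Concorde",        132, 1350 },
            { "Douglas DC-8-50", 146,  544 },
        };
        for (int i = 0; i < 4; i = i + 1)
            printf("%-16s throughput = %d passenger-mph\n",
                   planes[i].name, planes[i].capacity * planes[i].speed_mph);
        return 0;
    }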
Hamburger Stand:  Customers -> Cashier -> Cook

Customer survey: "Takes too long."
  - Taking orders faster improves the initial response time
  - The time to cook a burger stays the same, so throughput is unchanged
Performance Measures
  - The criteria depend on the user (the application)
  - Examine the complete process (specifications alone can mislead)
  - Balance the performance of the parts (the hamburger stand was cook-limited!)
Definition of Performance (for machine X on some task):

    Performance of X = 1 / Execution Time of X

"The performance of X is greater than that of Y" means:

    Performance of X > Performance of Y
    Execution Time of X < Execution Time of Y
"Machine X is n times faster than Y" means:

    Performance of X / Performance of Y = n

and therefore

    Execution Time of Y / Execution Time of X = n
Example: Machine X runs a program in 0.15 sec; Machine Y takes 0.3 sec.

    n = Execution Time of Y / Execution Time of X = 0.3 / 0.15 = 2

Machine X is 2 times faster than Y (equivalently, Machine Y is 2 times slower than X).

To minimize confusion we will say:
  - "faster" when comparing machines
  - "improve" when speaking of performance and execution time
Execution Time is the total task time, including OS overhead, memory accesses, disk accesses, etc.

To relate performance to a machine's specifications, another metric is CPU Execution Time: the time the CPU spends working on the task. It does not include waiting time and overhead.

For a program:

    CPU Execution Time = CPU Clock Cycles x Clock Cycle Time
                       = Instructions per Program x Clock Cycles per Instruction x Clock Cycle Time
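A minimal sketch of the formula above; the instruction count, CPI, and clock rate below are made-up values for illustration, not from the slides.

    #include <stdio.h>

    int main(void)
    {
        /* Illustrative inputs (assumed values). */
        double instructions = 2.0e9;   /* instructions executed by the program */
        double cpi          = 1.5;     /* average clock cycles per instruction */
        double clock_rate   = 2.0e9;   /* Hz; clock cycle time = 1 / clock rate */

        double clock_cycle_time = 1.0 / clock_rate;               /* seconds per cycle */
        double cpu_time = instructions * cpi * clock_cycle_time;  /* IC x CPI x cycle time */

        printf("CPU execution time = %.3f seconds\n", cpu_time);  /* prints 1.500 */
        return 0;
    }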
Aspects of CPU Performance

    CPU Time = Seconds / Program
             = (Instructions / Program) x (Cycles / Instruction) x (Seconds / Cycle)

                        Instr. Count   CPI   Clock Rate
    Program                  X
    Compiler                 X
    Instr. Set Arch.         X          X
    Implementation                      X        X
    Technology                          X        X
CPI: "Average clock cycles per instruction"

    CPI = Clock Cycles / Instruction Count
        = (CPU Time x Clock Rate) / Instruction Count

    CPU Time = Instruction Count x CPI / Clock Rate
             = Instruction Count x CPI x Clock Cycle Time

Average CPI over n instruction classes:

    Average CPI = [ SUM of CPI(i) x I(i), for i = 1..n ] / Instruction Count
                = SUM of CPI(i) x F(i), for i = 1..n

where I(i) is the number of instructions of class i, CPI(i) is the cycles per instruction for that class, and F(i) = I(i) / Instruction Count is the instruction frequency.

Invest resources where time is spent!
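A small sketch of the weighted-sum formula; the instruction classes, per-class CPIs, and frequencies below are an assumed mix for illustration, not data from the slides.

    #include <stdio.h>

    int main(void)
    {
        /* Assumed instruction mix: per-class CPI(i) and frequency F(i). */
        const char *cls[] = { "ALU",  "load", "store", "branch" };
        double cpi_i[]    = { 1.0,    2.0,    2.0,     1.5 };
        double freq_i[]   = { 0.50,   0.20,   0.10,    0.20 };   /* F(i) must sum to 1.0 */

        double avg_cpi = 0.0;
        for (int i = 0; i < 4; i = i + 1) {
            avg_cpi += cpi_i[i] * freq_i[i];      /* SUM of CPI(i) x F(i) */
            printf("%-6s CPI(i) = %.1f  F(i) = %.2f\n", cls[i], cpi_i[i], freq_i[i]);
        }

        printf("Average CPI = %.2f\n", avg_cpi);  /* 0.50 + 0.40 + 0.20 + 0.30 = 1.40 */
        return 0;
    }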
CPI Example

Suppose we have two implementations of the same instruction set. For some program with instruction count I:

    Machine A: clock cycle time 10 ns, CPI 2.0  ->  CPU Time = I x 2.0 x 10 ns = I x 20 ns
    Machine B: clock cycle time 20 ns, CPI 1.2  ->  CPU Time = I x 1.2 x 20 ns = I x 24 ns

Which machine is faster for this program, and by how much?

    A is 24/20 = 1.2 times faster than B.

Note: B has the smaller CPI, yet A is faster; CPI alone does not determine performance.
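A short sketch that cross-checks the example; the instruction count I cancels out of the ratio, so an arbitrary value is used here.

    #include <stdio.h>

    int main(void)
    {
        double I = 1.0e6;                  /* arbitrary instruction count; cancels in the ratio */

        double time_A = I * 2.0 * 10e-9;   /* CPI 2.0, clock cycle time 10 ns */
        double time_B = I * 1.2 * 20e-9;   /* CPI 1.2, clock cycle time 20 ns */

        printf("A is %.1f times faster than B\n", time_B / time_A);   /* 24/20 = 1.2 */
        return 0;
    }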