CS 3214 Computer Systems
description
Transcript of CS 3214 Computer Systems
CS 3214 Computer Systems
Godmar Back
Lecture 6
Announcements
• Exercise 3 & Project 1 posted– Please read instructions first– Must be done on McB 124 machines or on
rlogin cluster
• Minimum Requirements for project 1: phase 4
• Observe submission requirements for exercises
CS 3214 Fall 2010
Summary• Arrays in C
– Contiguous allocation of memory– Pointer to first element– No bounds checking
• Compiler Optimizations– Compiler often turns array code into pointer code (zd2int)– Uses addressing modes to scale array indices– Lots of tricks to improve array indexing in loops
• Structures– Allocate bytes in order declared– Pad in middle and at end to satisfy alignment
• Unions– Overlay declarations– Way to circumvent type system
CS 3214 Fall 2010
x86_64
• 64-bit extension of IA32– aka EM64T (Intel)
• Covered in Chapter 3.13 of 2nd edition
• Don’t confuse with IA64 “Itanium”
CS 3214 Fall 2010
x86_64 Highlights
• Extends 8 general purpose registers to 64bit lengths– And add 8 more 64bit registers
• C Binding: sizeof(int) still 4!; sizeof(anything *), sizeof(long), sizeof(long int) now 8. – NB: sizeof(long long) is 8 both on IA32 and x86_64– Aligns long and double at 8 byte boundaries
• Passing arguments in registers by default• No frame pointer• “Red Zone” of 128 bytes below stack pointer that
can be accessed without rsp movement
CS 3214 Fall 2010
x86_64
CS 3214 Fall 2010
See
http://www.x86-64.org/documentation.html
Inlined Assembly
• asm(“…” : <output> : <input> : <clobber>)• Means to inject assembly into code and link
with the remainder of the program in a controlled manner• Compiler doesn’t “know” what instructions do
– thus must describe – a) state the compiler must create upon entry: which
values must be in which registers, etc.– b) state produced by inline instructions: which
registers contain which values, etc. – also: any registers that may be clobbered
CS 3214 Fall 2010
Inlined Assembly Example
CS 3214 Fall 2010
Goal: exploit imull’s property to compute a 32x32-bit product: imull %ecx means (%edx, %eax) := %ecx * %eax
Magic instructions:“r”(leftop) – pick any 32bit register and put leftop in it“a” (rightop) – make sure %eax contains rightop“%2” substitute whichever register picked for ‘leftop’
“=A” result is in (%edx, %eax)“=b” result is in %ebx
/* Multiply two 32-bit operands into a full 64-bit product using the IA32
 * one-operand imull instruction (%edx:%eax := %eax * reg), and report whether
 * the overflow flag (OF) was set afterwards.
 * NOTE(review): operands are uint32_t but imull is the *signed* multiply;
 * the slide's point is obtaining the 64-bit product -- confirm signedness intent.
 *
 * Constraint breakdown (matches the slide text above):
 *   "=A" (result)   -- 64-bit output in the %edx:%eax register pair
 *   "=b" (overflow) -- output in %ebx; seto %%bl writes its low byte
 *   "r"  (leftop)   -- any 32-bit register; referenced as %2 in the template
 *   "a"  (rightop)  -- must be placed in %eax before the asm runs
 */
bool imul32x32_64(uint32_t leftop, uint32_t rightop, uint64_t *presult)
{
    uint64_t result;
    bool overflow;
    asm("imull %2" "\n\t"                 /* %edx:%eax := %eax * %2 */
        "seto %%bl" "\n\t"                /* %bl := overflow flag */
        : "=A" (result), "=b" (overflow)  // output constraint
        : "r" (leftop), "a" (rightop)     // input constraint
        );
    *presult = result;
    return overflow;
}
/* Duplicate listing of imul32x32_64 (the slide shows the code twice, beside
 * the annotations).  See the constraint breakdown in the slide text above. */
bool imul32x32_64(uint32_t leftop, uint32_t rightop, uint64_t *presult)
{
    uint64_t result;
    bool overflow;
    asm("imull %2" "\n\t"                 /* %edx:%eax := %eax * %2 */
        "seto %%bl" "\n\t"                /* capture the overflow flag */
        : "=A" (result), "=b" (overflow)  // output constraint
        : "r" (leftop), "a" (rightop)     // input constraint
        );
    *presult = result;
    return overflow;
}
Inlined Assembly (2)
CS 3214 Fall 2010
/* imul32x32_64 repeated on this slide, paired with the compiler-generated
 * assembly shown alongside: full 32x32 -> 64-bit multiply via one-operand
 * imull, overflow flag captured with seto. */
bool imul32x32_64(uint32_t leftop, uint32_t rightop, uint64_t *presult)
{
    uint64_t result;
    bool overflow;
    asm("imull %2" "\n\t"                 /* %edx:%eax := %eax * %2 */
        "seto %%bl" "\n\t"                /* %bl := overflow flag */
        : "=A" (result), "=b" (overflow)  // output constraint
        : "r" (leftop), "a" (rightop)     // input constraint
        );
    *presult = result;
    return overflow;
}
/* Second copy of imul32x32_64 on this slide (slide build artifact). */
bool imul32x32_64(uint32_t leftop, uint32_t rightop, uint64_t *presult)
{
    uint64_t result;
    bool overflow;
    asm("imull %2" "\n\t"                 /* %edx:%eax := %eax * %2 */
        "seto %%bl" "\n\t"                /* %bl := overflow flag */
        : "=A" (result), "=b" (overflow)  // output constraint
        : "r" (leftop), "a" (rightop)     // input constraint
        );
    *presult = result;
    return overflow;
}
# Compiler-generated IA32 code for imul32x32_64.  The #APP/#NO_APP markers
# bracket the programmer-supplied inline assembly.
imul32x32_64:
	pushl %ebp
	movl %esp, %ebp
	subl $12, %esp
	movl %ebx, (%esp)	# save callee-saved registers
	movl %esi, 4(%esp)	# (%ebx is written by the "=b" output)
	movl %edi, 8(%esp)
	movl 8(%ebp), %ecx	# leftop  -> %ecx (the "r" input, template %2)
	movl 12(%ebp), %eax	# rightop -> %eax (the "a" input)
#APP
	imull %ecx		# %edx:%eax := %eax * %ecx
	seto %bl		# %bl := overflow flag
#NO_APP
	movl %eax, %esi
	movl 16(%ebp), %eax	# %eax := presult
	movl %esi, (%eax)	# store low 32 bits of result
	movl %edx, 4(%eax)	# store high 32 bits of result
	movzbl %bl, %eax	# return value = zero-extended overflow byte
	movl (%esp), %ebx	# restore callee-saved registers
	movl 4(%esp), %esi
	movl 8(%esp), %edi
	movl %ebp, %esp
	popl %ebp
	ret
# Duplicate of the generated-code listing for imul32x32_64 (slide artifact).
imul32x32_64:
	pushl %ebp
	movl %esp, %ebp
	subl $12, %esp
	movl %ebx, (%esp)
	movl %esi, 4(%esp)
	movl %edi, 8(%esp)
	movl 8(%ebp), %ecx
	movl 12(%ebp), %eax
#APP
	imull %ecx
	seto %bl
#NO_APP
	movl %eax, %esi
	movl 16(%ebp), %eax
	movl %esi, (%eax)
	movl %edx, 4(%eax)
	movzbl %bl, %eax
	movl (%esp), %ebx
	movl 4(%esp), %esi
	movl 8(%esp), %edi
	movl %ebp, %esp
	popl %ebp
	ret
Floating Point on IA32
• History:– First implemented in 8087 coprocessor– “stack based” – FPU has 8 registers that form a stack
%st(0), %st(1), …– Known as ‘x87’ floating point
• Weirdness: internal accuracy 80bit (rather than IEEE 754 64bit) – thus storing involves rounding– Results depend on how often values are moved out
of the FPU registers into memory (which depends on compiler’s code generation strategy/optimization level) – not good!
CS 3214 Fall 2010
CS 3214 Fall 2010
Floating Point Code Example• Compute Inner
Product of Two Vectors– Single precision
arithmetic– Common computation
/* Inner (dot) product of two float vectors of length n, accumulated in
 * single precision.  Returns 0.0 when n <= 0. */
float ipf (float x[], float y[], int n)
{
    float sum = 0.0f;
    int idx = 0;
    while (idx < n) {
        sum += x[idx] * y[idx];
        idx++;
    }
    return sum;
}
# x87 (stack-based FPU) code for ipf: the running sum lives in %st(0) and is
# also the IA32 ABI float return register, so no final move is needed.
	pushl %ebp		# setup
	movl %esp,%ebp
	pushl %ebx
	movl 8(%ebp),%ebx	# %ebx=&x
	movl 12(%ebp),%ecx	# %ecx=&y
	movl 16(%ebp),%edx	# %edx=n
	fldz			# push +0.0
	xorl %eax,%eax		# i=0
	cmpl %edx,%eax		# if i>=n done
	jge .L3
.L5:
	flds (%ebx,%eax,4)	# push x[i]
	fmuls (%ecx,%eax,4)	# st(0)*=y[i]
	faddp			# st(1)+=st(0); pop
	incl %eax		# i++
	cmpl %edx,%eax		# if i<n repeat
	jl .L5
.L3:
	movl -4(%ebp),%ebx	# finish
	movl %ebp, %esp
	popl %ebp
	ret			# st(0) = result
Floating Point: SSE(*)
• Various extensions to x87 were introduced:– SSE, SSE2, SSE3, SSE4, SSE5
• Use 16 128bit %xmm registers– Can be used as 16x8bit, 4x32bit, 2x64bit, etc. for both
integer and floating point operations
• Use -mfpmath=sse -msse switches to enable (or -msse2, -msse3, -msse4)
• All doubles are 64bits internally - gives reproducible results independent of load/stores– Aside: if 80bit is ok, can combine -mfpmath=sse,387 for
24 registers
CS 3214 Fall 2010
CS 3214 Fall 2010
Floating Point SSE• Same code
compiled with:-msse2 -fpmath=sse
/* Single-precision inner product: sum over i of x[i] * y[i].
 * Same function as on the earlier slide, shown again for the SSE build. */
float ipf (float x[], float y[], int n)
{
    float acc = 0.0f;
    for (int k = 0; k < n; k++) {
        acc += x[k] * y[k];
    }
    return acc;
}
# Same ipf compiled for scalar SSE: movss/mulss/addss work on one float per
# iteration in %xmm registers; the result is copied back through the x87
# stack because the IA32 ABI returns floats in %st(0).
ipf:	pushl %ebp
	movl %esp, %ebp
	pushl %ebx
	subl $4, %esp
	movl 8(%ebp), %ebx	; &x
	movl 12(%ebp), %ecx	; &y
	movl 16(%ebp), %edx	; n
	xorps %xmm1, %xmm1
	testl %edx, %edx	; skip loop entirely when n <= 0
	jle .L4
	movl $0, %eax		; i = 0
	xorps %xmm1, %xmm1	; result = 0.0
.L5:	movss (%ebx,%eax,4), %xmm0	; t = x[i]
	mulss (%ecx,%eax,4), %xmm0	; t *= y[i]
	addss %xmm0, %xmm1	; result += t
	addl $1, %eax		; i = i+1
	cmpl %edx, %eax
	jne .L5
.L4:	movss %xmm1, -8(%ebp)	; spill result to memory...
	flds -8(%ebp)		; %st(0) = result
	addl $4, %esp
	popl %ebx
	popl %ebp
	ret
Vectorization
• SSE* instruction sets can operate on ‘vectors’
• For instance, if 128bit register is treated as (d1, d0) and (e1, e0), can compute (d1+e1, d0+e0) using single instruction – executes in parallel
• Also known as “SIMD”– Single instruction, multiple data
CS 3214 Fall 2010
CS 3214 Fall 2010
Floating Point SSE - Vectorized• Trying to make
compiler achieve transformation shown on the right:
float ipf (float x[], float y[], int n){ int i; float result = 0.0; for (i = 0; i < n; i++) { result += x[i] * y[i]; } return result;}
/* Hand-unrolled "logical transformation" of ipf: compute four products per
 * iteration so each group of four could map onto one SSE vector operation.
 * NOTE: assumes n is a multiple of 4 (no scalar tail loop), as on the slide.
 * Fix vs. the slide: the temporary array p was used without being declared. */
float ipf_vector (float x[], float y[], int n)
{
    int i;
    float p[4];           /* the four partial products of one iteration */
    float result = 0.0;
    for (i = 0; i < n; i += 4) {
        p[0] = x[i]   * y[i];
        p[1] = x[i+1] * y[i+1];
        p[2] = x[i+2] * y[i+2];
        p[3] = x[i+3] * y[i+3];
        result += p[0] + p[1] + p[2] + p[3];
    }
    return result;
}
Logical transformation, notactual code
Example: GCC Vector Extension
CS 3214 Fall 2010
/* Vector-of-4-floats type: gcc maps operations on v4sf values onto 128-bit
 * SSE registers (one v4sf = 4 packed floats, 16 bytes). */
typedef float v4sf __attribute__ ((vector_size (16)));

/* Inner product over arrays of 4-float vectors: n counts v4sf elements, so
 * 4*n floats are multiplied in total.  The element-wise multiply is a single
 * vector operation; the horizontal sum of each product is done with scalar
 * adds through a float* view of the vector. */
float ipf (v4sf x[], v4sf y[], int n)
{
    float result = 0.0;
    for (int j = 0; j < n; j++) {
        v4sf prod = x[j] * y[j];
        float *lanes = (float *)&prod;   /* treat vector as float * */
        float partialsum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
        result += partialsum;
    }
    return result;
}
magic attribute that tells gcc thatv4sf is a type denoting vectors of 4 floatsmagic attribute that tells gcc thatv4sf is a type denoting vectors of 4 floats
Example: GCC Vector Extensions
CS 3214 Fall 2010
typedef float v4sf __attribute__ ((vector_size (16)));
float ipf (v4sf x[], v4sf y[], int n){ int i; float partialsum, result = 0.0;
for (i = 0; i < n; i++) { v4sf p = x[i] * y[i];
float * v = (float *)&p; partialsum = v[0] + v[1] + v[2] + v[3]; result += partialsum; } return result;}
# gcc output for the vector-extension ipf: mulps multiplies 4 float pairs at
# once, but the horizontal sum goes through a stack slot at -24(%ebp) with
# four scalar addss -- the "spill" discussed on the Comments slide.
ipf:	pushl %ebp
	movl %esp, %ebp
	pushl %ebx
	subl $36, %esp
	movl 16(%ebp), %ebx	# n
	movl 8(%ebp), %edx	# x
	movl 12(%ebp), %eax	# y
	movl $0, %ecx		# i = 0
	xorps %xmm1, %xmm1	# result = 0.0
.L5:	movaps (%eax), %xmm0	# load y[i] (16-byte aligned)
	mulps (%edx), %xmm0	# 4 products in one instruction
	movaps %xmm0, -24(%ebp)	# spill p to the stack...
	movss -24(%ebp), %xmm0	# ...then add its 4 lanes one by one
	addss -20(%ebp), %xmm0
	addss -16(%ebp), %xmm0
	addss -12(%ebp), %xmm0
	addss %xmm0, %xmm1	# result += partialsum
	addl $1, %ecx
	addl $16, %edx		# advance by one v4sf (16 bytes)
	addl $16, %eax
	cmpl %ebx, %ecx
	jne .L5
	movss %xmm1, -28(%ebp)	# return through %st(0)
	flds -28(%ebp)
	addl $36, %esp
	popl %ebx
	popl %ebp
	ret
Comments
• Assembly code on previous slide is slightly simplified (omits first i < n check in case n ==0)
• Two problems with it– Problem 1: ‘partialsum’ is allocated on the
stack• value is said to be “spilled” to the stack
– Problem 2:• Does not use vector unit for computing sum
CS 3214 Fall 2010
SSE3: hadd_ps
• Treats 128bit as 4 floats (“packed single”)• Input are 2x128bit
(A3, A2, A1, A0) and (B3, B2, B1, B0)• Computes
(B3 + B2, B1 + B0, A3 + A2, A1 + A0) – “horizontal” operation “hadd”
• Apply twice to compute sum of all 4 elements in lowest element
• Use “intrinsics” – look like function calls, but are instructions for the compiler to use certain instructions– Unlike ‘asm’, compiler knows their meaning: no need to
specify input, output constraints, or what’s clobbered– Compiler performs register allocation
CS 3214 Fall 2010
GCC Vector Extensions + XMM Intrinsics
CS 3214 Fall 2010
#include <pmmintrin.h>
/* v4sf: GCC vector of 4 floats (16 bytes = one %xmm register). */
typedef float v4sf __attribute__ ((vector_size (16)));

/* Inner product using the SSE3 horizontal-add intrinsic: two haddps steps
 * collapse the 4 products into the low lane, avoiding the stack spill of the
 * previous version.  n counts v4sf elements; requires <pmmintrin.h>/-msse3. */
float ipf (v4sf x[], v4sf y[], int n)
{
    int i;
    float partialsum, result = 0.0;
    v4sf zero = _mm_setzero_ps();   // intrinsic, produces vector of 4 0.0f

    for (i = 0; i < n; i++) {
        v4sf p = x[i] * y[i];
        /* hadd(hadd(p, 0), 0) leaves p[0]+p[1]+p[2]+p[3] in lane 0;
         * _mm_store_ss copies just that lane into partialsum. */
        _mm_store_ss(
            &partialsum,
            _mm_hadd_ps(_mm_hadd_ps(p, zero), zero));
        result += partialsum;
    }
    return result;
}
Example: GCC Vector Extensions + XMM Intrinsics
CS 3214 Fall 2010
#include <pmmintrin.h>
/* Duplicate of the intrinsics version of ipf, paired on this slide with the
 * generated assembly shown alongside. */
typedef float v4sf __attribute__ ((vector_size (16)));

float ipf (v4sf x[], v4sf y[], int n)
{
    int i;
    float partialsum, result = 0.0;
    v4sf zero = _mm_setzero_ps();   /* vector of four 0.0f */

    for (i = 0; i < n; i++) {
        v4sf p = x[i] * y[i];
        /* two horizontal adds put the sum of p's 4 lanes in lane 0 */
        _mm_store_ss(
            &partialsum,
            _mm_hadd_ps(_mm_hadd_ps(p, zero), zero));
        result += partialsum;
    }
    return result;
}
# gcc output for the intrinsics version: the two haddps instructions replace
# the stack spill + four scalar addss of the earlier vector-extension code.
ipf:	pushl %ebp
	movl %esp, %ebp
	pushl %ebx
	subl $4, %esp
	movl 16(%ebp), %ebx	# n
	movl 8(%ebp), %edx	# x
	movl 12(%ebp), %eax	# y
	movl $0, %ecx		# i = 0
	xorps %xmm2, %xmm2	# result = 0.0
	xorps %xmm1, %xmm1	# zero vector used by haddps
.L5:	movaps (%eax), %xmm0
	mulps (%edx), %xmm0	# p = x[i] * y[i] (4 lanes at once)
	haddps %xmm1, %xmm0	# first horizontal add
	haddps %xmm1, %xmm0	# second: sum of all 4 lanes now in lane 0
	addss %xmm0, %xmm2	# result += partialsum
	addl $1, %ecx
	addl $16, %edx
	addl $16, %eax
	cmpl %ebx, %ecx
	jne .L5
	movss %xmm2, -8(%ebp)	# return through %st(0)
	flds -8(%ebp)
	addl $4, %esp
	popl %ebx
	popl %ebp
	ret