CPU Performance in Java.

19
CPU Performance in Java.

Transcript of CPU Performance in Java.

Page 1: CPU Performance in Java.

CPU Performance in Java.

Page 2: CPU Performance in Java.

Перед началом оптимизации

● а это нужно вообще?
● метрики
● качественные бенчмарки
● корректность измерений
● узкое место

Page 3: CPU Performance in Java.

Откуда ноги растут?

● плохая архитектура
● проблемы с потоками
● неэффективные алгоритмы
● неразумные структуры данных
● неаккуратная работа с памятью
● потери на I/O и работе с сетью
● ...

Page 4: CPU Performance in Java.

Задача: ускорить код

int N = 8192;

// Allocates an N x N matrix of bytes; every element starts at zero.
byte[][] arr = new byte[N][N];

/**
 * Scans the leading {@code N x N} region of {@code arr} for negative bytes.
 *
 * <p>Baseline version: the outer loop fixes the second index and the inner loop
 * walks the first one, so each consecutive read goes to a different inner array.
 *
 * @param arr matrix to scan; needs at least {@code N} rows of at least {@code N} elements
 * @param N   number of rows and columns to examine
 * @return {@code true} when the tally, decremented once per negative element, ends below zero
 */
static boolean check(byte[][] arr, int N) {
    int tally = 0;
    for (int col = 0; col < N; col++) {
        for (int row = 0; row < N; row++) {
            if (arr[row][col] < 0) {
                tally--;
            }
        }
    }
    return tally < 0;
}

3_744 ms

Page 5: CPU Performance in Java.

Нужно разобраться с устройством CPU

Page 6: CPU Performance in Java.

Если проще

Page 7: CPU Performance in Java.

CPU Cache

Page 8: CPU Performance in Java.

Вариант 1! Уже хорошо.

int N = 8192;

// Allocates an N x N matrix of bytes; every element starts at zero.
byte[][] arr = new byte[N][N];

/**
 * Scans the leading {@code N x N} region of {@code arr} for negative bytes.
 *
 * <p>Variant 1: the outer loop fixes the first index and the inner loop walks the
 * second one, so each inner array is read from start to end before moving on.
 *
 * @param arr matrix to scan; needs at least {@code N} rows of at least {@code N} elements
 * @param N   number of rows and columns to examine
 * @return {@code true} when the tally, decremented once per negative element, ends below zero
 */
static boolean check(byte[][] arr, int N) {
    int tally = 0;
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            if (arr[row][col] < 0) {
                tally--;
            }
        }
    }
    return tally < 0;
}

264 ms

Page 9: CPU Performance in Java.

Вариант 2! Отлично.

int N = 8192;

// Allocates an N x N matrix of bytes; every element starts at zero.
byte[][] arr = new byte[N][N];

/**
 * Scans the leading {@code N x N} region of {@code arr} for negative bytes.
 *
 * <p>Variant 2: same traversal as variant 1, but without a conditional in the loop body.
 *
 * @param arr matrix to scan; needs at least {@code N} rows of at least {@code N} elements
 * @param N   number of rows and columns to examine
 * @return {@code true} when the accumulated sum ends below zero
 */
static boolean check(byte[][] arr, int N) {
    int tally = 0;
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            // The byte is widened to int before the arithmetic shift, so this adds
            // -1 for a negative element and 0 for any other.
            tally += arr[row][col] >> 7;
        }
    }
    return tally < 0;
}

214 ms

Page 10: CPU Performance in Java.

Итого: 3744 / 214 ≈ 17.5 (результат зависит от CPU)

Page 11: CPU Performance in Java.

Нужно всё распоточить!

Page 12: CPU Performance in Java.

Работает медленно.

/**
 * Worker that increments a single element of the shared {@code arr} array
 * a fixed number of times.
 */
public class IterationThread implements Runnable {

    private int index;
    private long iterations;

    /**
     * @param iterations number of increments performed by {@link #run()}
     * @param index      position in {@code arr} that this worker updates
     */
    public IterationThread(long iterations, int index) {
        this.index = index;
        this.iterations = iterations;
    }

    /** Adds one to {@code arr[index]}, {@code iterations} times in a row. */
    @Override
    public void run() {
        long done = 0;
        while (done < iterations) {
            arr[index]++;
            done++;
        }
    }
}

/**
 * Benchmark driver for the "works slowly" slide: one thread per available processor,
 * thread {@code i} incrementing element {@code i} of a shared {@code long[]}, so
 * neighbouring threads write to adjacent array elements.
 */
public class FalseSharing {

    // volatile qualifies the array reference, not the individual elements.
    // NOTE(review): 512 slots, one per thread; assumes availableProcessors() <= 512.
    private static volatile long[] arr = new long[512];

    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
    private static final long ITERATIONS = 2_000_000_000L;

    /**
     * Creates the workers, runs them all to completion and prints the elapsed
     * wall-clock time in milliseconds, prefixed with {@code "time "}.
     */
    public static void main(String[] args) throws Throwable {
        Thread[] workers = new Thread[THREAD_COUNT];
        for (int slot = 0; slot < THREAD_COUNT; slot++) {
            workers[slot] = new Thread(new IterationThread(ITERATIONS, slot));
        }

        long startedAt = System.currentTimeMillis();
        for (Thread worker : workers) {
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();
        }
        long elapsed = System.currentTimeMillis() - startedAt;
        System.out.println("time " + elapsed);
    }
}

25_406 ms

Page 13: CPU Performance in Java.

Не все так просто. False sharing.

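False sharing (ложное разделение) — это ситуация, когда потоки работают с разными объектами программы, но эти объекты попадают в один и тот же блок (строку) кэш-памяти. Запись одного потока делает эту строку недействительной в кэшах других ядер, поэтому потоки мешают друг другу, хотя общих данных у них нет.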

Page 14: CPU Performance in Java.

Работает хорошо.

/**
 * Worker that increments a single element of the shared {@code arr} array
 * a fixed number of times.
 */
public class IterationThread implements Runnable {

    private int index;
    private long iterations;

    /**
     * @param iterations number of increments performed by {@link #run()}
     * @param index      position in {@code arr} that this worker updates
     */
    public IterationThread(long iterations, int index) {
        this.index = index;
        this.iterations = iterations;
    }

    /** Adds one to {@code arr[index]}, {@code iterations} times in a row. */
    @Override
    public void run() {
        long done = 0;
        while (done < iterations) {
            arr[index]++;
            done++;
        }
    }
}

/**
 * Benchmark driver for the "works well" slide: one thread per available processor,
 * thread {@code i} incrementing element {@code (i + 1) * SLOT_STRIDE} of a shared
 * {@code long[]}, so the elements used by different threads are spaced apart.
 */
public class TrueSharing {

    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
    private static final long ITERATIONS = 2_000_000_000L;

    /**
     * Distance, in array elements, between the slots of neighbouring threads.
     * 8 longs = 64 bytes; presumably chosen to match a 64-byte cache line (confirm for the target CPU).
     */
    private static final int SLOT_STRIDE = 8;

    // The highest index handed out is THREAD_COUNT * SLOT_STRIDE, so the array needs at
    // least (THREAD_COUNT + 1) * SLOT_STRIDE elements. A fixed length of 512 overflowed
    // on machines with 64 or more processors; 512 is kept as the lower bound.
    // Declared after THREAD_COUNT because static initializers run in textual order.
    // volatile qualifies the array reference, not the individual elements.
    private static volatile long arr[] = new long[Math.max(512, (THREAD_COUNT + 1) * SLOT_STRIDE)];

    /**
     * Creates the workers, runs them all to completion and prints the elapsed
     * wall-clock time in milliseconds, prefixed with {@code "time "}.
     */
    public static void main(String[] args) throws Throwable {
        Thread[] threads = new Thread[THREAD_COUNT];

        for (int i = 0; i < THREAD_COUNT; ++i) {
            // Slot 0 is skipped: the first thread starts at SLOT_STRIDE.
            threads[i] = new Thread(new IterationThread(ITERATIONS, (i + 1) * SLOT_STRIDE));
        }

        long start = System.currentTimeMillis();
        for (Thread t : threads) {
            t.start();
        }
        for (Thread t : threads) {
            t.join();
        }
        System.out.println("time " + (System.currentTimeMillis() - start));
    }
}

4_949 ms

Page 15: CPU Performance in Java.

А что, если нужно использовать свой собственный класс?

Page 16: CPU Performance in Java.

False sharing with custom objectpublic static class IterationThread implements Runnable { private int index; private long iterations;

public IterationThread(long iterations, int index) { this.index = index; this.iterations = iterations; }

@Override public void run() { for(long l = 0; l < iterations; ++l) { ++arr[index].val; } }}

/**
 * Benchmark driver: allocates one {@code MyObject} per available processor and runs
 * one thread per object, thread {@code i} updating {@code arr[i]}.
 */
public class FalseSharing {

    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
    private static final long ITERATIONS = 2_000_000_000L;
    private static MyObject[] arr = new MyObject[THREAD_COUNT];

    // Fill every slot up front; the objects are created one right after another.
    static {
        for (int slot = 0; slot < arr.length; slot++) {
            arr[slot] = new MyObject();
        }
    }

    /**
     * Creates the workers, runs them all to completion and prints the elapsed
     * wall-clock time in milliseconds, prefixed with {@code "time "}.
     */
    public static void main(String[] args) throws Throwable {
        Thread[] workers = new Thread[THREAD_COUNT];
        for (int slot = 0; slot < THREAD_COUNT; slot++) {
            workers[slot] = new Thread(new IterationThread(ITERATIONS, slot));
        }

        long startedAt = System.currentTimeMillis();
        for (Thread worker : workers) {
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();
        }
        long elapsed = System.currentTimeMillis() - startedAt;
        System.out.println("time " + elapsed);
    }
}

149_743 ms

/** Holder for a single volatile {@code long} counter, initialised to zero. */
public static class MyObject {

    public volatile long val = 0L;
}

Page 17: CPU Performance in Java.

Java 7 Padding.public static class IterationThread implements Runnable { private int index; private long iterations;

public IterationThread(long iterations, int index) { this.index = index; this.iterations = iterations; }

@Override public void run() { for(long l = 0; l < iterations; ++l) { arr[index].incrementAndGet(); } }}

/**
 * Benchmark driver: allocates one {@code MyObject} per available processor and runs
 * one thread per object, thread {@code i} updating {@code arr[i]}.
 */
public class FalseSharing {

    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
    private static final long ITERATIONS = 2_000_000_000L;
    private static MyObject[] arr = new MyObject[THREAD_COUNT];

    // Fill every slot up front; the objects are created one right after another.
    static {
        for (int slot = 0; slot < arr.length; slot++) {
            arr[slot] = new MyObject();
        }
    }

    /**
     * Creates the workers, runs them all to completion and prints the elapsed
     * wall-clock time in milliseconds, prefixed with {@code "time "}.
     */
    public static void main(String[] args) throws Throwable {
        Thread[] workers = new Thread[THREAD_COUNT];
        for (int slot = 0; slot < THREAD_COUNT; slot++) {
            workers[slot] = new Thread(new IterationThread(ITERATIONS, slot));
        }

        long startedAt = System.currentTimeMillis();
        for (Thread worker : workers) {
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();
        }
        long elapsed = System.currentTimeMillis() - startedAt;
        System.out.println("time " + elapsed);
    }
}

14_539 ms

/**
 * {@link AtomicLong} counter that carries six additional volatile {@code long} fields.
 * Nothing in this class reads them; going by the slide title they presumably serve as
 * padding that enlarges each instance (actual field layout is decided by the JVM).
 */
public static class MyObject extends AtomicLong {

    public volatile long p1;
    public volatile long p2;
    public volatile long p3;
    public volatile long p4;
    public volatile long p5;
    // Only the last field carries an explicit initial value.
    public volatile long p6 = 7L;
}

Page 18: CPU Performance in Java.

Java 8. @sun.misc.Contendedpublic static class IterationThread implements Runnable { private int index; private long iterations;

public IterationThread(long iterations, int index) { this.index = index; this.iterations = iterations; }

@Override public void run() { for(long l = 0; l < iterations; ++l) { arr[index].incrementAndGet(); } }}

/**
 * Benchmark driver: allocates one {@code MyObject} per available processor and runs
 * one thread per object, thread {@code i} updating {@code arr[i]}.
 */
public class FalseSharing {

    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
    private static final long ITERATIONS = 2_000_000_000L;
    private static MyObject[] arr = new MyObject[THREAD_COUNT];

    // Fill every slot up front; the objects are created one right after another.
    static {
        for (int slot = 0; slot < arr.length; slot++) {
            arr[slot] = new MyObject();
        }
    }

    /**
     * Creates the workers, runs them all to completion and prints the elapsed
     * wall-clock time in milliseconds, prefixed with {@code "time "}.
     */
    public static void main(String[] args) throws Throwable {
        Thread[] workers = new Thread[THREAD_COUNT];
        for (int slot = 0; slot < THREAD_COUNT; slot++) {
            workers[slot] = new Thread(new IterationThread(ITERATIONS, slot));
        }

        long startedAt = System.currentTimeMillis();
        for (Thread worker : workers) {
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();
        }
        long elapsed = System.currentTimeMillis() - startedAt;
        System.out.println("time " + elapsed);
    }
}

14_983 ms

// unlock JVM option: -XX:-RestrictContended

@Contended

public static class MyObject extends AtomicLong { public volatile long anyVal;}

Page 19: CPU Performance in Java.

Спасибо за внимание