Teaching Your Machine To Find Fraudsters

Post on 17-May-2015

1.557 views 1 download

Tags:

description

The slides from my talk at PHP Tek 11. When dealing with money online, fraud is an ongoing problem for both consumers and sellers. Researchers have been developing statistical and machine learning techniques to detect shady sellers on auction sites, spot fraudulent payments on e-commerce systems and catch click fraud on adverts. While there is no silver bullet, you will learn to flag suspicious activity and help protect your site from scammers using PHP and a little help from some other technologies.

Transcript of Teaching Your Machine To Find Fraudsters

TEACHING YOUR MACHINE TO FIND FRAUDSTERS

Ian Barber ianb@php.net phpir.com twitter.com/ianbarber

5%3%

.1%8%

SOME SMALL NUMBERS

99% ACCURACY

REALLY LEGITIMATE

REALLY FRAUD

EVALUATED LEGITIMATE

989 0

EVALUATED FRAUD

10 1

REALLY LEGITIMATE

REALLY FRAUD

EVALUATED LEGITIMATE

989 0

EVALUATED FRAUD

10 1

90% WRONG

ANOMALY DETECTION

0

7.5

15

22.5

30

Clicks

Date

Detector

User Clicks Ad

Alarm

No Alarm

Landing Page

SOFTWARE ARCHITECTURE

Buffer

Threshold

Expected Clicks

Alarm

Sensitivity

Data Buffer

statistics

DETECTOR

// average.php
/**
 * Moving-average click-fraud detector.
 *
 * Reads one click count per CSV row (column 1) and raises an alarm whenever a
 * row's count exceeds the mean of the seven previously buffered counts by more
 * than $sen population standard deviations.
 *
 * @param float  $sen        Sensitivity: how many standard deviations above the
 *                           window mean a count must be to raise an alarm.
 * @param string $file       CSV file to read; defaults to the originally
 *                           hard-coded "fraudclicks.csv".
 * @param int    $fraudStart Row number after which an alarm ends the scan and a
 *                           non-alarm row counts towards days-to-detect;
 *                           defaults to the originally hard-coded 201.
 *
 * @return array array(alarm count minus one, days to detect). The subtraction
 *               treats the final, scan-ending alarm as the genuine detection;
 *               it is applied even when no alarm ended the scan.
 *
 * @throws RuntimeException When the CSV file cannot be opened.
 */
function detect($sen, $file = "fraudclicks.csv", $fraudStart = 201) {
    $windowSize = 7;
    $window = array();
    $i = 0;
    $alarmCount = 0;
    $dtd = 0;

    $fraud = fopen($file, 'r');
    if ($fraud === false) {
        throw new RuntimeException("Unable to open click data file: " . $file);
    }

    // Delimiter, enclosure and escape are spelled out (they are the historical
    // defaults) so newer PHP versions do not warn about the implicit escape.
    while (($d = fgetcsv($fraud, 0, ',', '"', '\\')) !== false) {
        if (!isset($d[1])) {
            // Blank or short line: there is no click count to evaluate.
            continue;
        }
        $i++;

        // Evaluation starts once the buffer holds more than a full window;
        // the oldest entry is dropped so exactly $windowSize values remain.
        if (count($window) > $windowSize) {
            array_shift($window);
            $avg = array_sum($window) / $windowSize;

            // Fresh accumulator for every row: the original kept adding to the
            // previous row's standard deviation and subtracted an undefined
            // $average instead of $avg.
            $sumSquares = 0;
            foreach ($window as $val) {
                $sumSquares += pow($val - $avg, 2);
            }
            $stddev = sqrt($sumSquares / $windowSize);

            if ($d[1] > ($avg + ($sen * $stddev))) {
                $alarmCount++;
                if ($i > $fraudStart) {
                    break;
                }
            } else if ($i > $fraudStart) {
                $dtd++;
            }
        }
        array_push($window, $d[1]);
    }
    fclose($fraud);

    return array($alarmCount - 1, $dtd);
}

0

7.5

15

22.5

30

Clicks

Date

18 False Alarms 1 Day To Detect

1.6 SENSITIVITY

0

7.5

15

22.5

30

Clicks

Date

1 False Alarm 18 Days To Detect

2.7 SENSITIVITY

SICKNESS AVAILABILITY

// Sickness/availability click detector, as shown on the slides.
// NOTE(review): the listing is truncated - the while loop and the function are never closed, and nothing visible fills $window or $avail or increments $i.
// For each CSV row, $dow is the day-of-week number (date("w"), 0 = Sunday) of the date in column 0.
// An estimate is only attempted once $window holds at least 7 entries and $avail has an entry for that weekday.
function detect($sens) { $i = 0; $alarms = 0; $dtd = 0; $window = array(); $avail = array(); $fraud = fopen("fraudclicks.csv", 'r'); while($dat = fgetcsv($fraud)) { $dow = date("w", strtotime($dat[0])); if( count($window) >= 7 && isset($avail[$dow]) ) {

// $sick is the mean, over the window, of each value divided by the average of $avail for that window key.
// NOTE(review): this assumes $window is keyed so that $avail[$day] exists - confirm against the omitted code.
$sick = 0; foreach($window as $day => $value) { $dowavg = array_sum($avail[$day]) / count($avail[$day]); $sick += $value / $dowavg; } $sick /= count($window);

sickavail.php

// $est is the $sick ratio multiplied by the average of $avail for the current weekday.
// $p is the Poisson probability of seeing exactly $dat[1] clicks when the mean is $est.
// fac() is not defined in this listing - presumably a factorial helper; verify.
$avlblty = array_sum($avail[$dow]) / count($avail[$dow]); $est = $sick * $avlblty; $fac = fac($dat[1]); $p = exp(-$est) * pow($est,$dat[1]) / $fac; // poisson calc

// Alarm only when the observed count is both improbable ($p below $sens) and above the estimate.
// When $i > 201 an alarm ends the scan, and a non-alarm row adds one to $dtd.
if($p < $sens && $dat[1] > $est) { $alarms++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } }

} // end if

0

0.05

0.1

0.15

0.2

1 2 3 4 5 6 7 8 9 10

0

7.5

15

22.5

30

Clicks

Date

1 False Alarm 1 Day To Detect

0.011 SENSITIVITY

SUPERVISED CLASSIFIERS

Classifier

User Purchase

Fraud

Not Fraud Transaction Processor

Transaction Database Learner

classification model

SOFTWARE ARCHITECTURE

Learner Training Data Model

Model

Test Data

Classifier Prediction Accuracy

EVALUATING THE CLASSIFIER

0

5

10

15

20

0 5 10 15 20

?

0

5

10

15

20

0 5 10 15 20

?

0

5

10

15

20

0 5 10 15 20

// Labelled example transactions: fraud flag, price, description and shipping country.
$fields = array('fraud', 'price', 'desc', 'ship');
$rows = array(
    array(false, 1699, 'toy ninja', 'US'),
    array(false, 20000, 'TV', 'US'),
    array(false, 2500, 'cds', 'US'),
    array(true, 20000, 'console', 'CN'),
    array(true, 5000, 'books', 'US'),
    array(true, 15000, 'ipod', 'CN'),
);
$docs = array();
foreach ($rows as $row) {
    $docs[] = array_combine($fields, $row);
}

// Open (or create) the "index" database and prepare a term generator with an English stemmer.
$db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);
$idx = new XapianTermGenerator();
$stem = new XapianStem("english");
$idx->set_stemmer($stem);

// One Xapian document per example: the class label is stored as the document
// data, and price, description and country are indexed together as text.
foreach ($docs as $key => $doc) {
    $label = "clean";
    if ($doc['fraud']) {
        $label = "fraud";
    }
    $text = implode(' ', array($doc['price'], $doc['desc'], $doc['ship']));

    $xdoc = new XapianDocument();
    $xdoc->set_data($label);
    $idx->set_document($xdoc);
    $idx->index_text($text);
    $db->add_document($xdoc, $key);
}

// Drop the reference to the writable database - presumably so it is released before the script ends.
$db = null;

fraudknn.php

// The unlabelled transaction to classify.
$test = array(
    'price' => 10000,
    'desc'  => 'TV',
    'ship'  => 'CN',
);

// Open (or create) the "index" database and prepare a term generator with an English stemmer.
$db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);
$idx = new XapianTermGenerator();
$stem = new XapianStem("english");
$idx->set_stemmer($stem);

// Index price, description and country as one text and keep the id that
// add_document() returns for the new document.
$text = implode(' ', array($test['price'], $test['desc'], $test['ship']));
$xdoc = new XapianDocument();
$idx->set_document($xdoc);
$idx->index_text($text);
$id = $db->add_document($xdoc);

testknn.php

// Mark document $id as the only relevant document and ask for up to 10 expansion terms.
$enq = new XapianEnquire($db);
$rset = new XapianRSet();
$rset->add_document($id);
$eset = $enq->get_eset(10, $rset);

// Collect the expansion terms into a plain array.
$terms = array();
for ($termIt = $eset->begin(); !$termIt->equals($eset->end()); $termIt->next()) {
    $terms[] = $termIt->get_term();
}

// OR the terms together and fetch the top 4 matches.
$q = new XapianQuery(XapianQuery::OP_OR, $terms);
$enq->set_query($q);
$matches = $enq->get_mset(0, 4, $rset);

// Dump the stored data of every match except document $id itself.
for ($matchIt = $matches->begin(); !$matchIt->equals($matches->end()); $matchIt->next()) {
    if ($matchIt->get_document()->get_docid() == $id) {
        continue;
    }
    $class = $matchIt->get_document()->get_data();
    var_dump($class);
}

// Remove document $id from the database again.
$db->delete_document($id);

$ php testknn.php
string(5) "clean"
string(5) "fraud"
string(5) "fraud"

TRANSACTION PARAMETERS

// email.php
/**
 * Scores how closely an e-mail address resembles a customer's name.
 *
 * Both inputs are lower-cased. In the mailbox part "." and "+" become spaces,
 * and the domain is cut at its first dot, before comparing with similar_text().
 *
 * @param string $name  Customer name, words separated by single spaces.
 * @param string $email E-mail address; a value without "@" is treated as
 *                      having an empty domain.
 *
 * @return float 1.0 when the whole name is more than 80% similar to the
 *               mailbox, 0.8 when it is that similar to the domain. Otherwise
 *               a score scaled from the best single-word match: -1.0 when no
 *               word is more than 50% similar, up to 0.7 for an exact match.
 */
function compareEmailToName($name, $email) {
    $name = strtolower($name);
    $email = strtolower($email);
    $pcnt = 0;

    // Split by hand rather than with list() so an address without "@" gives
    // an empty domain instead of an undefined-offset warning.
    $pieces = explode("@", $email);
    $user = str_replace(array(".", "+"), " ", $pieces[0]);
    $dom = isset($pieces[1]) ? preg_replace("/\..*/", "", $pieces[1]) : "";

    similar_text($name, $user, $pcnt);
    if ($pcnt > 80) {
        return 1.0;
    }

    similar_text($name, $dom, $pcnt);
    if ($pcnt > 80) {
        return 0.8;
    }

    // Fall back to the best match between any single word of the name and
    // either half of the address. explode() always yields at least one element,
    // so the original's trailing "return -1" could never run; an unmatched name
    // still scores -1.0 through the formula below.
    $highest = 0;
    foreach (explode(" ", $name) as $part) {
        foreach (array($user, $dom) as $candidate) {
            similar_text($candidate, $part, $pcnt);
            if ($pcnt > 50 && $pcnt > $highest) {
                // The original assigned an undefined $percent here, which
                // discarded every per-word match.
                $highest = $pcnt;
            }
        }
    }

    return (1.7 * ($highest / 100)) - 1;
}

// Sample transaction record: one value per named transaction parameter.
$data = array(
    'purchase_value'             => 20993,
    'geo_country'                => 'DE',
    'previous_orders'            => 1,
    'time'                       => 6,
    'timegap'                    => 146632,
    'product_category'           => 'small_item',
    'delivery_matches_card'      => 0,
    'geo_ip_matches_card'        => 1,
    'difference_from_last_trans' => 8755,
    'free_shipping'              => 0,
    'email_like_name'            => 0,
    'free_email_provider'        => 0,
    'disposable_email_provider'  => 0,
    'quantity'                   => 2,
    'fraud'                      => 0
);

SUPPORT VECTOR MACHINES

0

5

10

15

20

0 5 10 15 20

0

5

10

15

20

0 5 10 15 20

0

5

10

15

20

0 5 10 15 20

0

5

10

15

20

0 5 10 15 20

0

5

10

15

20

0 5 10 15 20

$ apt-get install libsvm-dev
$ apt-get install libsvm-tools

$ yum install libsvm-devel

$ pecl install svm-beta
$ echo extension=svm.so > /etc/php.d/svm.ini
$ php -r '$s = new svm(); $m = $s->train(array(array(-1, -1), array(1, 1))); echo $m->predict(array(0, -1));'
-1

// learn.php
// Build one training vector per CSV row. Element 0 is the class label; the
// remaining sparse keys are features, either +1.0/-1.0 flags or rescaled numbers.
$fh = fopen('paydata.csv', 'r');
$output = array();

// Encodes a yes/no condition as +1.0 / -1.0.
$flag = function ($on) {
    return $on ? 1.0 : -1.0;
};

while ($row = fgetcsv($fh)) {
    $vector = array();
    $vector[0]  = ($row[14] == 1) ? -1 : 1;     // label: -1 when column 14 is 1
    $vector[1]  = ($row[0] / 20000.00) - 1.0;   // price
    $vector[2]  = $flag($row[1] == 'CN');
    $vector[3]  = $flag($row[1] == 'US');
    $vector[4]  = $flag($row[5] == 'digital');
    $vector[5]  = $flag($row[7] == 1);          // geo
    $vector[6]  = $flag($row[6] == 1);          // deliv
    $vector[12] = $flag($row[9] == 1);          // ship
    $vector[13] = ($row[13] / 1.5) - 1.0;       // qty
    $output[] = $vector;
}

// Train with per-class weights and store the model on disk.
$svm = new svm();
$model = $svm->train($output, array(-1 => 0.65, 1 => 0.5));
$model->save('learn.model');

// Tally predictions against the labels of the same vectors used for training.
// "Positive" means a value above zero.
$fp = $tp = $fn = $tn = 0;
foreach ($output as $test) {
    $res = $model->predict($test);
    $labelPositive = $test[0] > 0;
    $predictedPositive = $res > 0;

    if ($labelPositive && $predictedPositive) {
        $tp++;
    } elseif ($labelPositive) {
        $fn++;
    } elseif ($predictedPositive) {
        $fp++;
    } else {
        $tn++;
    }
}

// ...snip.. loading test data from
// paytest.csv

// Load the model file written by the training step.
$model = new SVMModel('learn.model');

// Tally each prediction against the label in element 0 of its vector.
// "Positive" means a value above zero.
$tp = 0;
$tn = 0;
$fp = 0;
$fn = 0;
foreach ($output as $test) {
    $res = $model->predict($test);
    if ($test[0] > 0) {
        ($res > 0) ? $tp++ : $fn++;
    } else {
        ($res > 0) ? $fp++ : $tn++;
    }
}

test.php

// Print the four tallies, then the share of all predictions that were correct.
$summary = array(
    "True Positive "  => $tp,
    "True Negative "  => $tn,
    "False Positive " => $fp,
    "False Negative " => $fn,
);
foreach ($summary as $label => $count) {
    var_dump($label . $count);
}

$total = $tp + $tn + $fp + $fn;
var_dump("Accuracy " . (($tp + $tn) / $total));

$ php learn.php
string(18) "True Positive 8316"
string(18) "True Negative 1682"
string(16) "False Positive 2"
string(16) "False Negative 0"
string(15) "Accuracy 0.9998"

$ php test.php
string(17) "True Positive 844"
string(17) "True Negative 155"
string(16) "False Positive 0"
string(16) "False Negative 1"
string(14) "Accuracy 0.999"

Test Verify Update

Automated Manual Manual

training data

Time Series Class Based

Sensitivity Model

Days To Detect

False Alarms

False Positives

False Negatives

(shogun)

TEACHING YOUR MACHINE TO FIND FRAUDSTERS

http://joind.in/3429

Ian Barber ianb@php.net

Title Slide - CSI http://www.flickr.com/photos/39matt/5241862082
Sickness Availability - Chicago Fire Department http://www.flickr.com/photos/mike_miley/3929146730/
Model Buildings - Ah Ain’t Long For This Whorl http://www.flickr.com/photos/chadmiller/98014022/
Repeat Customer - McDonald’s Loyalty Card http://www.flickr.com/photos/fsse-info/3658873057/
Shipping - FedEx Truck http://www.flickr.com/photos/moto_club4ag/4852235145/
Velocity - Chevrolet Chevelle Dragster http://www.flickr.com/photos/jns001/2958999006/
GeoIP - Earth Asia Terminator View http://www.flickr.com/photos/flyingsinger/86898564/
Multiple Items - Boxes http://www.flickr.com/photos/skrewtape/851672959/