Simple HTTP log analyzer

Here is a simple, lightweight HTTP log analyzer written in PHP. I wrote it to brush up my skills in writing scheduled jobs for a typical LAMP stack. It is nothing fancy, and there is more that could be implemented for detecting malicious requests, but this will get you started:

<?php
/**
 * Simple HTTP log parser and report generation script in PHP.
 * Assumes the HTTP log is in Combined Log Format.
 */
// Directory holding the access logs. The trailing slash is required because
// the file name is appended to it verbatim.
$log_file_path = "/usr/local/apache/logs/";
//$log_file_path = "./";
// Log file name: today's date formatted as ddmmyyyy ("dmY") immediately
// followed by "access_log_combined".
// NOTE(review): the original comment described the name as
// "mmddyyyy_access_log_combined" (month first, underscore separator), which
// is not what this expression produces - confirm against the real rotation
// scheme before relying on it.
$log_file_name = date("dmY")."access_log_combined";
//$log_file_name = "samplelog.log";//used for my unit test;

// Status codes counted as successful requests: all listed 2xx and 3xx codes.
$success_codes = array("200","201","202","203","204","205","206","300","301","302","303","304","305","306","307");
// Status codes treated as signs of a malicious request.
// NOTE(review): the original comment listed 511 where this list has 501 -
// confirm which one is intended.
$malicious_codes = array("400","401","403","405","499","413","414","421","429","431","508","501");
// Any other status code is counted as an error.
// HTTP methods treated as signs of a malicious request.
$malicious_http_methods = array("OPTIONS","PUT","DELETE");
/**
 * Other common patterns of malicious attempts that could be implemented later
 * (only status codes and methods are checked here):
 *  - brute force: POSTs to the same login page within sub-second intervals
 *  - spiders: GETs to many different pages within sub-second intervals
 *  - XSS: <script>alert(xss)</script> style payloads
 *  - command injection: e.g. "IP & cat /etc/passwd"
 *  - path traversal: e.g. "textfile=../etc/passwd"
 *  - BeEF: "beef" followed by "result_id="
 */

// Counters and tallies filled in while scanning the log.
$errors = 0;                    // lines that are neither success nor malicious
$success = 0;                   // lines whose status is in $success_codes
$accessed_files = array();      // requested path of every well-formed line
$refferers = array();           // referrer field of every well-formed line
$user_agent = array();          // user-agent field of every well-formed line
$malicious_requests = array();  // raw log lines flagged as malicious
$total_entries = 0;             // every line read, well-formed or not

if (($fp = fopen($log_file_path.$log_file_name, "r"))) {
    while (($line = fgets($fp)) !== false) {
        $total_entries++;

        // Splitting a combined-format line on the double quote yields:
        //   [0] host/ident/user/[timestamp]   [1] "METHOD path protocol"
        //   [2] " status bytes "              [3] referrer
        //   [4] " "                           [5] user agent
        // (explode() replaces split(), which was removed in PHP 7.)
        $line_chunks = explode('"', $line);
        $request_parts = isset($line_chunks[1]) ? explode(" ", $line_chunks[1]) : array();

        // A line without all quoted fields, or without "METHOD path", cannot
        // be classified. Count it as an error and keep it out of the tallies
        // instead of reading undefined indexes.
        if (count($line_chunks) < 6 || count($request_parts) < 2) {
            $errors++;
            continue;
        }

        $http_method = $request_parts[0];
        $accessed_files[] = $request_parts[1];

        $status_parts = explode(" ", trim($line_chunks[2]));
        $status_code = $status_parts[0];

        // Success is checked first, so a request is only flagged as malicious
        // when its status is not one of the success codes.
        if (in_array($status_code, $success_codes, true)) {
            $success++;
        }
        // The method lives in the request field; the original tested
        // $line_chunks[0] (the host/timestamp prefix), which never matched.
        else if (in_array($status_code, $malicious_codes, true) || in_array($http_method, $malicious_http_methods, true)) {
            $malicious_requests[] = $line;
        }
        else {
            $errors++;
        }

        $refferers[] = $line_chunks[3];
        $user_agent[] = $line_chunks[5];
    }
    fclose($fp);
}
else {
    echo "Log file does not exist!";
}

// Report. Percentages are relative to the total number of lines read.
echo "Total entries >> ".$total_entries."\n\n";
echo "Total Success >> ".$success."\n\n";
echo "Total Errors >> ".$errors."\n\n";
echo "\n\nBelow were malicious requests>>\n";
print_r($malicious_requests);
echo "\n\nTop Accessed Files >>\n";
$top_accessed = array_top_recurring($accessed_files,$total_entries);
print_r($top_accessed);
echo "\n\nTop Refferers >>\n";
$top_refferers= array_top_recurring($refferers,$total_entries);
print_r($top_refferers);
echo "\n\nTop User Agents >>\n";
$top_user_agent= array_top_recurring($user_agent,$total_entries);
print_r($top_user_agent);
/**
 * Groups equal values of $arr and reports the most frequent ones as a
 * percentage of $total_ent.
 *
 * @param array $arr       Values to tally. array_count_values() only counts
 *                         strings and integers and warns on anything else.
 * @param int   $total_ent Denominator for the percentages (e.g. total number
 *                         of log lines). When it is not positive no share can
 *                         be computed and an empty array is returned.
 * @param int   $top       Maximum number of distinct values to report
 *                         (defaults to 20, the original hard-coded limit).
 * @return array Map of value => "<rounded percent>%", most frequent first.
 *               Always an array: the original returned NULL whenever fewer
 *               than 20 distinct values were present.
 */
function array_top_recurring($arr, $total_ent, $top = 20){
    $final_arr = array();
    // Guard against division by zero and a meaningless limit.
    if ($total_ent <= 0 || $top <= 0) {
        return $final_arr;
    }

    $count = array_count_values($arr);
    arsort($count);

    // preserve_keys=true keeps numeric-looking values (e.g. "200") as keys
    // instead of letting array_slice() renumber them.
    foreach (array_slice($count, 0, $top, true) as $key => $val) {
        $final_arr[$key] = round(($val / $total_ent) * 100)."%";
    }
    return $final_arr;
}
?>