清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
#!/bin/sh
############################# Program notes #############################
# 1. All output is written under ../result (relative to the directory the
#    script is run from). This script does not create any directory, so
#    the tree below must exist beforehand.
# 2. ../result must contain four sub-directories:
#      url_data    URL records that still need to be analysed
#      url_result  records classified as noise URLs
#      app_result  records already classified as application traffic
#      app_data    application records that still need processing
# 3. ../result/log_out.log receives the start and end timestamps of a run.
# 4. Input: every *.txt file in the current directory, fields separated
#    by "|". Field layout as documented by the original author:
#      $1:IMSI  $2:MDN  $3:MEID  $4:DestinationIP  $5:DestinationPort
#      $6:SourceIP  $7:SourcePort  $8:ProtocolID  $17:ServiceType
#      $10:StartTime  $11:EndTime  $12:Duration  $13:InputOctets
#      $14:OutputOctets  $26:DestinationURL
#    NOTE(review): the code below switches on $16 as the protocol type
#    (1/2 = HTTP/WAP, 3/4/5 = SMTP/POP3/IMAP4, 6/8 = FTP/MMS, 7 = RTSP per
#    the original inline comments); $16 is not listed in the table above.
#    Confirm against the feed specification.
# 5. strftime() is used, so awk is expected to be gawk (or another awk
#    that provides strftime).
#########################################################################
awk -F '|' '
# Append the current record to the noise file:
# id, URL, protocol, service type, flag 1, empty column.
function emit_noise() {
    print $2, $26, $16, $17, 1, empty >> noise_out
}

BEGIN {
    # The host name prefixes every output file so that files produced on
    # separately deployed machines get distinct names.
    "hostname" | getline host
    close("hostname")

    OFS = ","
    empty = ""
    a19_tag = "a19"
    http_prefix = "http://"

    # Full output paths are precomputed: an unparenthesized concatenation
    # after ">>" is a syntax error in some awk implementations.
    url_out        = "../result/url_data/"   host "_url.txt"
    noise_out      = "../result/url_result/" host "_noice.txt"
    a19_out        = "../result/app_result/" host "_app_a19.txt"
    app_result_out = "../result/app_result/" host "_app_result.txt"
    app_data_out   = "../result/app_data/"   host "_app_data.txt"
    log_out        = "../result/log_out.log"

    print "......解析文件开始........" strftime("%Y-%m-%d %H:%M:%S") >> log_out
}

{
    # Output rotation. NR counts records across all input files; every
    # 3200000 records the url/noise/a19 outputs switch to a new file whose
    # name embeds NR. Sizes quoted by the original author: 500000 records
    # is about 30M, 3200000 about 130M, 57000000 about 2G. Change the
    # constant to get larger or smaller files.
    # The app_result and app_data outputs are not rotated.
    if (NR % 3200000 == 0) {
        # Release the finished files before switching names.
        close(url_out)
        close(noise_out)
        close(a19_out)
        url_out   = "../result/url_data/"   host "_" NR "_url.txt"
        noise_out = "../result/url_result/" host "_" NR "_noice.txt"
        a19_out   = "../result/app_result/" host "_" NR "_app_a19.txt"
    }

    # Record identifier written as the first output column.
    rec_id = $2

    # A URL is noise when it references a static or binary resource.
    # ".js" must be followed by a non-word character or the end of the URL
    # (so ".jsp" and ".json" are kept). Note that in awk "\b" is a
    # backspace, not a word boundary, hence the explicit class and "$".
    static_resource = (tolower($26) ~ /(\.js([^[:alnum:]_]|$)|\.img|\.inf|\.dat|\.dwr|\.fla|\.mp4|\.cmr|\.asm|\.cfg|\.amr|\.war|\.tdz|\.md5|\.jar|\.cmd|\.gif|\.png|\.jpeg|\.bmp|\.def|\.jpg|\.css|\.ico|\.cur|\.swf|\.txt|\.avi|\.xml|\.zip|\.cab|\.crl|\.mp3|\.tpt|\.fcg|\.lrc|\.action|\.rar|\.m4a|\.idx|\.exe|\.dll|\.ini|\.vbs|\.doc|\.flv)/)

    # Fields 5, 6, 7, 8 and 16 must all be non-empty.
    complete = (length($5) > 0 && length($6) > 0 && length($7) > 0 && length($8) > 0 && length($16) > 0)

    if (static_resource || !complete) {
        emit_noise()
        next
    }

    if ($16 == 1 || $16 == 2) {
        # Protocol type HTTP or WAP (per the original comment).
        if (length($26) == 0) {
            emit_noise()
            next
        }

        # Split the URL on "/" and take everything after the first "?"
        # as the query string (empty when there is no "?").
        nseg = split($26, seg, "/")
        split($26, before_query, "?")
        query = substr($26, length(before_query[1]) + 2)

        if (seg[1] == "http:" || seg[1] == "HTTP:" || seg[1] == "HTTPS:" || seg[1] == "https:") {
            # With a scheme, seg[3] is the host and seg[4], seg[5] the
            # first two path segments.
            # NOTE(review): when the URL has exactly 5 segments the third
            # column is deliberately left empty, as in the original code.
            third = (nseg == 5) ? empty : seg[5]
            print rec_id, $16, $17, $26, seg[3], seg[4], third, query >> url_out
        } else {
            # No scheme: prepend "http://" to the URL column; seg[1] is
            # the host. Exactly 3 segments leaves the third column empty.
            third = (nseg == 3) ? empty : seg[3]
            print rec_id, $16, $17, http_prefix $26, seg[1], seg[2], third, query >> url_out
        }
    } else if ($16 == 3 || $16 == 4 || $16 == 5) {
        # Protocols 3:SMTP 4:POP3 5:IMAP4 (per the original comment).
        if ($17 == 399 || $17 == 499 || $17 == 599) {
            print rec_id, $26, $16, $17, a19_tag, empty, empty >> a19_out
        }
    } else if ($16 == 6 || $16 == 8) {
        # Protocols 6:FTP 8:MMS (per the original comment).
        if ($17 == 699 || $17 == 899) {
            print rec_id, $26, $16, $17, 0, -1, empty >> app_result_out
        }
    } else if ($16 == 7) {
        # Protocol 7:RTSP (per the original comment).
        if ($17 == 799) {
            print rec_id, $5, $6, $16, $17 >> app_data_out
        }
    }
    # Records with any other protocol or service type are dropped.
}

END {
    print "......解析文件结束........" strftime("%Y-%m-%d %H:%M:%S") >> log_out
}
' *.txt