第二届高校大数据比赛之鼠标轨迹识别
比赛地址:http://bdc.saikr.com/c/cql/34541
赛题
鼠标轨迹识别当前广泛运用于多种人机验证产品中,不仅便于用户的理解记忆,而且极大增加了暴力破解难度。但攻击者可通过黑产工具产生类人轨迹批量操作以绕过检测,并在对抗过程中不断升级其伪造数据以持续绕过同样升级的检测技术。我们期望用机器学习算法来提高人机验证中各种机器行为的检出率,其中包括对抗过程中出现的新的攻击手段的检测。
数据格式
评测指标
F = 5PR/(2P+3R)*100,其中 P 为精确率(precision),R 为召回率(recall)。
数据读取和处理
#####数据读取和处理
import pandas as pd
import os
def get_data(file):
    """Parse a raw mouse-trajectory file into one long-format DataFrame.

    Every non-blank line is split on whitespace.  The second field is read
    as ``;``-separated ``x,y,t`` points and the third field as
    ``target_x,target_y``.  The first field and any field after the third
    (for example a trailing label) are ignored; rows are keyed by the
    1-based line number instead.

    Args:
        file: Path of the text file to read.

    Returns:
        DataFrame with columns ``id, x, y, t, target_x, target_y`` and one
        row per trajectory point.  ``id`` is an int; all other values are
        kept as the strings found in the file.
    """
    points = []
    targets = []
    # Single pass: iterating the handle avoids loading the file twice.
    with open(file) as f:
        for line_no, line in enumerate(f, start=1):
            # split() with no argument also removes the trailing newline, so
            # the target field never needs a positional "[:-1]" trim.
            fields = line.split()
            if not fields:
                continue
            # Skip empty chunks (e.g. the one after a trailing ";") instead
            # of blindly dropping the last element.
            points.extend(
                [line_no] + point.split(",")
                for point in fields[1].split(";")
                if point
            )
            # Carry the id with the target row so the join does not rely on
            # a hard-coded number of lines.
            targets.append([line_no] + fields[2].split(","))
    data = pd.DataFrame(points, columns=["id", "x", "y", "t"])
    d2 = pd.DataFrame(targets, columns=["id", "target_x", "target_y"])
    return pd.merge(data, d2, on="id")
数据可视化
import matplotlib.pyplot as plt
%matplotlib inline
# plt.xticks(list(range(len(b))),b["x"].values)
import os
# Render trajectories 1..3000 as images, one PNG per trajectory id.
# NOTE(review): `data` is presumably the frame returned by get_data — confirm.
# A raw string keeps the Windows backslashes literal.
path = r"F:\competition_data\Bigdata\images"
# NOTE(review): the output directory presumably has to exist before saving;
# create it once with os.mkdir(path) if it is missing.
for i in range(1, 3001):
    b = data[data.id == i]
    k = list(b["x"].values)
    l = list(b["y"].values)
    plt.plot(k, l, "o-")
    fig = plt.gcf()
    fig.set_size_inches(30, 15)
    # os.path.join supplies the path separator, so no hand-escaped backslash
    # literal is needed between the directory and the file name.
    fig.savefig(os.path.join(path, str(i) + ".png"), dpi=100)
    # Close the figure so successive trajectories do not pile up on one plot.
    plt.close()
特征提取
###特征提取
def _trajectory_features(traj_id, traj):
    """Return the feature dict for one trajectory (rows kept in input order)."""
    import numpy as np

    # Point-to-point deltas; the first row has no predecessor and is dropped.
    step = traj[["x", "y", "t"]].diff(1).dropna()
    distance = np.sqrt(step["x"] ** 2 + step["y"] ** 2)
    speed = np.log1p(distance) - np.log1p(step["t"])
    angles = np.log1p(step["y"]) - np.log1p(step["x"])
    speed_diff = speed.diff(1).dropna()
    angle_diff = angles.diff(1).dropna()

    # Distance from every point to the target, kept as a standalone Series so
    # nothing is written back into a slice of the caller's frame.
    aim_distance = np.sqrt(
        (traj["x"] - traj["target_x"]) ** 2
        + (traj["y"] - traj["target_y"]) ** 2
    )
    aim_distance_diff = aim_distance.diff(1).dropna()

    x_step = traj["x"].diff(1).dropna()
    y_step = traj["y"].diff(1).dropna()

    return {
        "id": traj_id,
        "speed_diff_median": speed_diff.median(),
        "speed_diff_mean": speed_diff.mean(),
        "speed_diff_var": speed_diff.var(),
        "speed_diff_max": speed_diff.max(),
        "angle_diff_var": angle_diff.var(),
        "time_delta_min": step["t"].min(),
        "time_delta_max": step["t"].max(),
        "time_delta_var": step["t"].var(),
        "distance_deltas_max": distance.max(),
        "distance_deltas_var": distance.var(),
        "aim_distance_last": aim_distance.values[-1],
        "aim_distance_diff_max": aim_distance_diff.max(),
        "aim_distance_diff_var": aim_distance_diff.var(),
        "mean_speed": speed.mean(),
        "median_speed": speed.median(),
        "var_speed": speed.var(),
        "max_angle": angles.max(),
        "var_angle": angles.var(),
        "kurt_angle": angles.kurt(),
        "y_min": traj["y"].min(),
        "y_max": traj["y"].max(),
        "y_var": traj["y"].var(),
        "y_mean": traj["y"].mean(),
        "x_min": traj["x"].min(),
        "x_max": traj["x"].max(),
        "x_var": traj["x"].var(),
        "x_mean": traj["x"].mean(),
        # Number of steps taken in the less frequent direction on each axis.
        "x_back_num": min((x_step > 0).sum(), (x_step < 0).sum()),
        "y_back_num": min((y_step > 0).sum(), (y_step < 0).sum()),
        "xs_delta_var": x_step.var(),
        "xs_delta_max": x_step.max(),
        "xs_delta_min": x_step.min(),
    }


def get_features(data):
    """Aggregate per-point trajectory rows into one feature row per id.

    Args:
        data: DataFrame with columns ``id, x, y, t, target_x, target_y`` and
            one row per trajectory point.  The coordinate, time and target
            columns may hold numbers or numeric strings; the input frame is
            not modified.

    Returns:
        DataFrame with an ``id`` column followed by the feature columns, one
        row per trajectory in ascending ``id`` order, indexed 0..n-1.
        Trajectories consisting of exactly one point are skipped.  An empty
        DataFrame is returned when no trajectory qualifies.
    """
    # Coerce to numeric so the arithmetic below also works on string columns.
    numeric_cols = ["x", "y", "t", "target_x", "target_y"]
    data = data.assign(**{col: pd.to_numeric(data[col]) for col in numeric_cols})

    # Group on the ids actually present rather than assuming they run from 0
    # to n-1; collecting rows first avoids re-copying the result per id.
    rows = [
        _trajectory_features(traj_id, traj.reset_index(drop=True))
        for traj_id, traj in data.groupby("id", sort=True)
        if len(traj) != 1
    ]
    if not rows:
        return pd.DataFrame()
    # Pin the column order to the order in which the features are defined.
    return pd.DataFrame(rows, columns=list(rows[0]))
模型
###xgb
import xgboost as xgb
# Train a boosted-tree binary classifier through the native xgboost API.
# NOTE(review): `train`, `test` and `feval` are defined outside this block —
# confirm they are in scope before this cell runs.

# Feature matrices: drop the identifier (and, for training, the label).
# The axis is passed by keyword; recent pandas rejects a positional axis.
test_x = test.drop("id", axis=1)
train_x = train.drop(["id", "label"], axis=1)
dtest = xgb.DMatrix(test_x)
# dval = xgb.DMatrix(val_x,label=val_data.label)
dtrain = xgb.DMatrix(train_x, label=train.label)
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    # "scale_pos_weight": 1500.0/13458.0,
    "eval_metric": "auc",
    "gamma": 0.1,  # 0.2 is ok
    "max_depth": 3,
    # "lambda":550,
    "subsample": 0.7,
    "colsample_bytree": 0.4,
    # "min_child_weight":2.5,
    "eta": 0.007,
    # "learning_rate":0.01,
    "seed": 1024,
    "nthread": 7,
}
# Only the training set is monitored; no validation set is attached.
watchlist = [
    (dtrain, "train"),
    # (dval,"val")
]  # The early stopping is based on last set in the evallist
model = xgb.train(
    params,
    dtrain,
    feval=feval,
    # maximize=False,
    num_boost_round=1500,
    # early_stopping_rounds=10,
    # verbose_eval =30,
    evals=watchlist,
)
# model=xgb.XGBClassifier(
# max_depth=4,
# learning_rate=0.007,
# n_estimators=1500,
# silent=True,
# objective="binary:logistic",
# # booster="gbtree",
# # n_jobs=-1,
# nthread=7,
# # gamma=0,
# # min_child_weight=1,
# # max_delta_step=0,
# subsample=0.7,
# colsample_bytree=0.7,
# # colsample_bylevel=0.7,
# # reg_alpha=0,
# # reg_lambda=1,
# scale_pos_weight=1,
# base_score=0.5,
# # random_state=0,
# seed=1024,
# missing=None,
# )
# xgb.cv(params,dtrain,num_boost_round=1500,nfold=10,feval=feval,early_stopping_rounds=50,)
# model.save_model("./model/xgb.model")
# print "best best_ntree_limit",model.best_ntree_limit
评价函数
def eval(clf, x, y):
    """Score a fitted estimator with F = 5PR/(2P+3R)*100, treating 0 as the hit class.

    Predictions below 1 count as class 0.  P is the share of predicted zeros
    that are truly 0 and R is the share of true zeros that were predicted 0.
    A summary line and the final score are printed.

    Args:
        clf: Object exposing ``predict(x)``.
        x: Feature matrix passed straight to ``clf.predict``.
        y: True labels, array-like.

    Returns:
        The score as a float; 0.0 when no true zero was predicted as zero
        (this also covers the case where either denominator would be zero).
    """
    import numpy as np

    prob = np.asarray(clf.predict(x))
    truth = np.asarray(y)
    # Vectorized form of "anything that is not >= 1 becomes class 0".
    pred_zero = ~(prob >= 1)
    true_zero = truth == 0
    hits = int((true_zero & pred_zero).sum())
    n_pred = int(pred_zero.sum())
    n_true = int(true_zero.sum())
    print("TP" + " : " + str(hits) + " " + "预测" + " : " + str(n_pred) + " " + "真实" + " : " + str(n_true))
    # With no hits the score is 0; returning early also avoids 0/0 -> NaN.
    if hits == 0:
        print(0.0)
        return 0.0
    p = hits / float(n_pred)
    r = hits / float(n_true)
    f = 5 * p * r / (2 * p + 3 * r) * 100
    print(f)
    return f
def feval(pred, dtrain):
    """Custom xgboost metric: F = 5PR/(2P+3R)*100 with 0 as the hit class.

    Predictions below 0.5 count as class 0.  A separator line and the score
    are printed on every call.  ``pred`` is left unmodified.

    Args:
        pred: Array of predicted values.
        dtrain: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        Tuple ``("f", score)``; the score is 0.0 when no true zero was
        predicted as zero (this also covers zero denominators).
    """
    import numpy as np

    y = np.asarray(dtrain.get_label())
    # Threshold into a new mask instead of overwriting the caller's array.
    pred_zero = ~(np.asarray(pred) >= 0.5)
    true_zero = y == 0
    hits = int((true_zero & pred_zero).sum())
    print("---------------------------------------------------------")
    # With no hits the score is 0; returning early also avoids 0/0 -> NaN.
    if hits == 0:
        print(0.0)
        return "f", 0.0
    p = hits / float(pred_zero.sum())
    r = hits / float(true_zero.sum())
    f = 5 * p * r / (2 * p + 3 * r) * 100
    print(f)
    return "f", f
def target(score, num):
    """Return ``score * (40000 + 3 * num) / 5``.

    NOTE(review): this looks like the metric F = 5PR/(2P+3R) solved for the
    hit count, with ``num`` predictions and 20000 true items — confirm.
    """
    weighted_total = 40000 + 3 * num
    return score * weighted_total / 5
线下cv
# Offline 10-fold cross-validation scored with the custom `eval` scorer.
# NOTE(review): `m` (the estimator) and `train` come from outside this block.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # older scikit-learn only ships the legacy module
    from sklearn.cross_validation import cross_val_score

# .iloc[:, 1:-1] selects every column except the first and the last by
# position (presumably the id and label columns — confirm).
score = cross_val_score(m, train.iloc[:, 1:-1], train.label, cv=10, scoring=eval)
score.mean()
提交结果
# Score the test set, then keep the ids of the 20000 rows with the lowest
# predicted value, cast to int.
pred = model.predict(dtest)
test["prob"] = pred
submit = (
    test.sort_values(by="prob")
    .head(20000)[["id"]]
    .astype(int)
)
线上成绩0.91
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。