
Kaggle Hands-On Learning Notes

Created: 2017-11-27

Study notes

Lesson 1

Data and visualization
#numpy科学计算工具箱
import numpy as np
#使用make_classification构造1000个样本,每个样本有20个feature
from sklearn.datasets import make_classification
X, y = make_classification(1000, n_features=20, n_informative=2, 
                           n_redundant=2, n_classes=2, random_state=0)
#存为dataframe格式
from pandas import DataFrame
df = DataFrame(np.hstack((X, y[:, None])),columns = range(20) + ["class"])  #注意hstack
df[:6]

import matplotlib.pyplot as plt
import seaborn as sns
#使用pairplot去看不同特征维度pair下数据的空间分布状况
_ = sns.pairplot(df[:50], vars=[8, 11, 12, 14, 19], hue="class", size=1.5)
plt.show()

import matplotlib.pyplot as plt
plt.figure(figsize=(12, 10))
_ = sns.corrplot(df, annot=False)  #新版本中移除此函数
plt.show()
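sns.corrplot was removed from newer seaborn releases; a heatmap of the correlation matrix carries the same information. A minimal sketch, assuming the same df as above:

import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 10))
sns.heatmap(df.corr(), annot=False, cmap="coolwarm")   # df.corr() gives the pairwise feature correlations
plt.show()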

Plotting learning curves
from sklearn.svm import LinearSVC
from sklearn.learning_curve import learning_curve
#绘制学习曲线,以确定模型的状况
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """
    画出data在某模型上的learning curve.
    参数解释
    ----------
    estimator : 你用的分类器。
    title : 表格的标题。
    X : 输入的feature,numpy类型
    y : 输入的target vector
    ylim : tuple格式的(ymin, ymax), 设定图像中纵坐标的最低点和最高点
    cv : 做cross-validation的时候,数据分成的份数,其中一份作为cv集,其余n-1份作为training(默认为3份)
    """

    plt.figure()
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, "o-", color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, "o-", color="g",
             label="Cross-validation score")

    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid("on") 
    if ylim:
        plt.ylim(ylim)
    plt.title(title)
    plt.show()

#少样本的情况情况下绘出学习曲线
plot_learning_curve(LinearSVC(C=10.0), "LinearSVC(C=10.0)",
                    X, y, ylim=(0.8, 1.01),
                    train_sizes=np.linspace(.05, 0.2, 5))

#增大一些样本量
plot_learning_curve(LinearSVC(C=10.0), "LinearSVC(C=10.0)",
                    X, y, ylim=(0.8, 1.1),
                    train_sizes=np.linspace(.1, 1.0, 5))

plot_learning_curve(LinearSVC(C=10.0), "LinearSVC(C=10.0) Features: 11&14", X[:, [11, 14]], y, ylim=(0.8, 1.0), train_sizes=np.linspace(.05, 0.2, 5))


Model ensembling: the stacking/blending approach

"""Kaggle competition: Predicting a Biological Response.

Blending {RandomForests, ExtraTrees, GradientBoosting} + stretching to
[0,1]. The blending scheme is related to the idea Jose H. Solorzano
presented here:
http://www.kaggle.com/c/bioresponse/forums/t/1889/question-about-the-process-of-ensemble-learning/10950#post10950
"""You can try this: In one of the 5 folds, train the models, then use
the results of the models as "variables" in logistic regression over
the validation data of that fold""". Or at least this is the
implementation of my understanding of that idea :-)

The predictions are saved in test.csv. The code below created my best
submission to the competition:
- public score (25%): 0.43464
- private score (75%): 0.37751
- final rank on the private leaderboard: 17th over 711 teams :-)

Note: if you increase the number of estimators of the classifiers,
e.g. n_estimators=1000, you get a better score/rank on the private
test set.

Copyright 2012, Emanuele Olivetti.
BSD license, 3 clauses.
"""

from __future__ import division
import numpy as np
import load_data
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression


def logloss(attempt, actual, epsilon=1.0e-15):
    """Logloss, i.e. the score of the bioresponse competition.
    """
    attempt = np.clip(attempt, epsilon, 1.0-epsilon)    #这个方法会给出一个区间,在区间之外的数字将被剪除到区间的边缘,例如给定一个区间[0,1],则小于0的将变成0,大于1则变成1.
    return - np.mean(actual * np.log(attempt) +          
                     (1.0 - actual) * np.log(1.0 - attempt))       ##注意logLoss的具体写法
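A quick sanity check of the logloss above (toy numbers, for illustration only): a confident correct prediction scores low, a confident wrong one scores high.

actual = np.array([1.0, 0.0, 1.0])
good = np.array([0.9, 0.1, 0.8])    # confident and mostly right
bad = np.array([0.1, 0.9, 0.2])     # confident and mostly wrong
print(logloss(good, actual))        # roughly 0.14
print(logloss(bad, actual))         # roughly 2.07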


if __name__ == "__main__":

    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 10
    verbose = True
    shuffle = False

    X, y, X_submission = load_data.load()        

    if shuffle:
        idx = np.random.permutation(y.size)           #产生随机数
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))            #分层KFold

    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion="gini"),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion="entropy"),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion="gini"),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion="entropy"),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

    print "Creating train and test sets for blending."

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):       #注意此种写法,enumerate
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print
    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print "Saving Results."
    tmp = np.vstack([range(1, len(y_submission)+1), y_submission]).T
    np.savetxt(fname="submission.csv", X=tmp, fmt="%d,%0.9f",             #保存为文本
               header="MoleculeId,PredictedProbability", comments="")


# Basic CSV read/write helpers
# Read the provided training data first, then do the downstream (feature) processing

def read_data(file_name):
    f = open(file_name)
    #ignore header
    f.readline()
    samples = []
    target = []
    for line in f:
        line = line.strip().split(",")
        sample = [float(x) for x in line]
        samples.append(sample)
    return samples

def write_delimited_file(file_path, data,header=None, delimiter=","):
    f_out = open(file_path,"w")
    if header is not None:
        f_out.write(delimiter.join(header) + "\n")
    for line in data:
        if isinstance(line, str):            # note the isinstance check: strings are written as-is, other rows are joined
            f_out.write(line + "\n")
        else:
            f_out.write(delimiter.join(line) + "\n")
    f_out.close()

#!/usr/bin/env python



Bio-response competition  https://www.kaggle.com/c/bioresponse#description
from sklearn.linear_model import LogisticRegression
import csv_io                ## self-written helper module with read_data / write_delimited_file (defined above)
import math
import scipy

def train_and_predict():
    #read in the training file
    train = read_data("train.csv")         #使用read_data 在csv_io 自写模块中
    print "读取训练数据完毕
...
"
    #set the training responses
    target = [x[0] for x in train]
    #set the training features
    train = [x[1:] for x in train]
    #read in the test file
    realtest = read_data("test.csv")
    print "读取待预测数据
...
"

    # code for logistic regression
    lr = LogisticRegression()
    lr.fit(train, target)
    print "Logistic Regression训练完毕!
...
"
    predicted_probs = lr.predict_proba(realtest)

    # write solutions to file
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    write_delimited_file("lr_solution.csv", predicted_probs)

    print "Logistic Regression预测完毕! 请提交lr_solution.csv文件到Kaggle"

if __name__=="__main__":
    train_and_predict()
	
	
Kaggle San Francisco crime classification, https://www.kaggle.com/c/sf-crime
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import numpy as np

# 先了解自己的数据
train = pd.read_csv("sf_data/train.csv", parse_dates=["Dates"])   # 注意设置时间
test = pd.read_csv("sf_data/test.csv", parse_dates=["Dates"])

train.head()
test.head()
all_addr = np.array(train.Address.tolist() + test.Address.tolist())

list(all_addr)

stop_words = ["dr", "wy", "bl", "av", "st", "ct", "ln", "block", "of"]
vectorizer = CountVectorizer(max_features=300, stop_words=stop_words)
features = vectorizer.fit_transform(all_addr).toarray()        # 稀疏矩阵用toarray()转化为矩阵
features[0,:]
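To see what the address vectorizer is doing, here is a toy run on two made-up address strings (illustrative only): the stop_words list strips the street-type tokens and the remaining words become count features.

toy_vec = CountVectorizer(stop_words=stop_words)
toy_counts = toy_vec.fit_transform(["100 Block of MARKET ST", "OAK ST / LAGUNA ST"])
print(toy_vec.get_feature_names())   # ["100", "laguna", "market", "oak"]; "block", "of", "st" were removed
print(toy_counts.toarray())          # per-address token counts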


X = features[:train.shape[0]]
y = train.Category

#分成80%的训练集和20%的验证集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

log_model = LogisticRegression().fit(X=X_train, y=y_train)

results = log_model.predict_proba(X_test)

np.round(results[1], 3)

log_loss_score = log_loss(y_test, results)
print("log loss score: {0}".format(round(log_loss_score, 3)))


log_model = LogisticRegression().fit(X=features[:train.shape[0]], y=train.Category)
results = log_model.predict_proba(features[train.shape[0]:])
results


submission = pd.DataFrame(results)      #注意转换为DataFrame,然后进行下面的一系列操作
submission.columns = sorted(train.Category.unique())
submission = submission.set_index(test.Id)   # set_index returns a new frame, so reassign it
submission.index.name="Id"
submission.to_csv("py_submission_logreg_addr_300.csv")

A classic and entertaining Kaggle case: Titanic  https://www.kaggle.com/c/titanic

# 这个ipython notebook主要是我解决Kaggle Titanic问题的思路和过程

import pandas as pd #数据分析
import numpy as np #科学计算
from pandas import Series,DataFrame

data_train = pd.read_csv("Train.csv")
data_train.columns
#data_train[data_train.Cabin.notnull()]["Survived"].value_counts()  

data_train.info()
data_train.describe()


import matplotlib.pyplot as plt
fig = plt.figure()
fig.set(alpha=0.2)  # 设定图表颜色alpha参数

plt.subplot2grid((2,3),(0,0))             # 在一张大图里分列几个小图
data_train.Survived.value_counts().plot(kind="bar")# plots a bar graph of those who surived vs those who did not. 
plt.title(u"获救情况 (1为获救)") # puts a title on our graph
plt.ylabel(u"人数")  

plt.subplot2grid((2,3),(0,1))
data_train.Pclass.value_counts().plot(kind="bar")
plt.ylabel(u"人数")
plt.title(u"乘客等级分布")

plt.subplot2grid((2,3),(0,2))
plt.scatter(data_train.Survived, data_train.Age)
plt.ylabel(u"年龄")                         # sets the y axis lable
plt.grid(b=True, which="major", axis="y") # formats the grid line style of our graphs
plt.title(u"按年龄看获救分布 (1为获救)")


plt.subplot2grid((2,3),(1,0), colspan=2)
data_train.Age[data_train.Pclass == 1].plot(kind="kde")   # plots a kernel desnsity estimate of the subset of the 1st class passanges"s age
data_train.Age[data_train.Pclass == 2].plot(kind="kde")
data_train.Age[data_train.Pclass == 3].plot(kind="kde")
plt.xlabel(u"年龄")# plots an axis lable
plt.ylabel(u"密度") 
plt.title(u"各等级的乘客年龄分布")
plt.legend((u"头等舱", u"2等舱",u"3等舱"),loc="best") # sets our legend for our graph.


plt.subplot2grid((2,3),(1,2))
data_train.Embarked.value_counts().plot(kind="bar")
plt.title(u"各登船口岸上船人数")
plt.ylabel(u"人数")  
plt.show()


#看看各乘客等级的获救情况
fig = plt.figure()
fig.set(alpha=0.2)  # 设定图表颜色alpha参数

Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()
df=pd.DataFrame({u"获救":Survived_1, u"未获救":Survived_0})
df.plot(kind="bar", stacked=True)
plt.title(u"各乘客等级的获救情况")
plt.xlabel(u"乘客等级") 
plt.ylabel(u"人数") 

plt.show()


#看看各登录港口的获救情况
fig = plt.figure()
fig.set(alpha=0.2)  # 设定图表颜色alpha参数

Survived_0 = data_train.Embarked[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Embarked[data_train.Survived == 1].value_counts()
df=pd.DataFrame({u"获救":Survived_1, u"未获救":Survived_0})
df.plot(kind="bar", stacked=True)
plt.title(u"各登录港口乘客的获救情况")
plt.xlabel(u"登录港口") 
plt.ylabel(u"人数") 

plt.show()

#看看各性别的获救情况
fig = plt.figure()
fig.set(alpha=0.2)  # 设定图表颜色alpha参数

Survived_m = data_train.Survived[data_train.Sex == "male"].value_counts()
Survived_f = data_train.Survived[data_train.Sex == "female"].value_counts()
df=pd.DataFrame({u"男性":Survived_m, u"女性":Survived_f})
df.plot(kind="bar", stacked=True)
plt.title(u"按性别看获救情况")
plt.xlabel(u"性别") 
plt.ylabel(u"人数")
plt.show()

#然后我们再来看看各种舱级别情况下各性别的获救情况
fig=plt.figure()
fig.set(alpha=0.65) # 设置图像透明度,无所谓
plt.title(u"根据舱等级和性别的获救情况")

ax1=fig.add_subplot(141)
data_train.Survived[data_train.Sex == "female"][data_train.Pclass != 3].value_counts().plot(kind="bar", label="female highclass", color="#FA2479")
ax1.set_xticklabels([u"获救", u"未获救"], rotation=0)
ax1.legend([u"女性/高级舱"], loc="best")

ax2=fig.add_subplot(142, sharey=ax1)
data_train.Survived[data_train.Sex == "female"][data_train.Pclass == 3].value_counts().plot(kind="bar", label="female, low class", color="pink")
ax2.set_xticklabels([u"未获救", u"获救"], rotation=0)
plt.legend([u"女性/低级舱"], loc="best")

ax3=fig.add_subplot(143, sharey=ax1)
data_train.Survived[data_train.Sex == "male"][data_train.Pclass != 3].value_counts().plot(kind="bar", label="male, high class",color="lightblue")
ax3.set_xticklabels([u"未获救", u"获救"], rotation=0)
plt.legend([u"男性/高级舱"], loc="best")

ax4=fig.add_subplot(144, sharey=ax1)
data_train.Survived[data_train.Sex == "male"][data_train.Pclass == 3].value_counts().plot(kind="bar", label="male low class", color="steelblue")
ax4.set_xticklabels([u"未获救", u"获救"], rotation=0)
plt.legend([u"男性/低级舱"], loc="best")

plt.show()

g = data_train.groupby(["SibSp","Survived"])          # 注意分组统计,多层分组的应用
df = pd.DataFrame(g.count()["PassengerId"])          ###使用此种方式进行多层分组统计
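The same cross-tabulation can be read off in one chained call (an equivalent sketch): size() counts the rows in each (SibSp, Survived) group and unstack() pivots Survived into columns.

data_train.groupby(["SibSp", "Survived"]).size().unstack(fill_value=0)   # rows: SibSp, columns: Survived (0/1)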

data_train.Cabin.value_counts()  # value_counts 应用, 和count区别 values_counts分类了, count没有分类,统一统计了

#cabin的值计数太分散了,绝大多数Cabin值只出现一次。感觉上作为类目,加入特征未必会有效
#那我们一起看看这个值的有无,对于survival的分布状况,影响如何吧
fig = plt.figure()
fig.set(alpha=0.2)  # 设定图表颜色alpha参数

Survived_cabin = data_train.Survived[pd.notnull(data_train.Cabin)].value_counts()
Survived_nocabin = data_train.Survived[pd.isnull(data_train.Cabin)].value_counts()
df=pd.DataFrame({u"有":Survived_cabin, u"无":Survived_nocabin}).transpose()   ##注意用词典的方式进行画图。用上面过两维,和有无两维,组成了类似2*2交叉表,此处需要transpose,作图的时候
df.plot(kind="bar", stacked=True)
plt.title(u"按Cabin有无看获救情况")
plt.xlabel(u"Cabin有无") 
plt.ylabel(u"人数")
plt.show()

#似乎有cabin记录的乘客survival比例稍高,那先试试把这个值分为两类,有cabin值/无cabin值,一会儿加到类别特征好了

from sklearn.ensemble import RandomForestRegressor


 
### Fill in the missing Age values with a RandomForestRegressor
def set_missing_ages(df):
    
    # 把已有的数值型特征取出来丢进Random Forest Regressor中
    age_df = df[["Age","Fare", "Parch", "SibSp", "Pclass"]]   #注意使用此种方式固定列,便于以后操作

    # 乘客分成已知年龄和未知年龄两部分
    known_age = age_df[age_df.Age.notnull()].as_matrix()     # 注意notnull, isnull 的用法,并且转化为矩阵
    unknown_age = age_df[age_df.Age.isnull()].as_matrix()

    # y即目标年龄
    y = known_age[:, 0]

    # X即特征属性值
    X = known_age[:, 1:]

    # fit到RandomForestRegressor之中
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    
    # 用得到的模型进行未知年龄结果预测
    predictedAges = rfr.predict(unknown_age[:, 1::])   #此种写法,不包含第一列
    
    # 用得到的预测结果填补原缺失数据
    df.loc[ (df.Age.isnull()), "Age" ] = predictedAges     # 用loc取
    
    return df, rfr

def set_Cabin_type(df):
    df.loc[ (df.Cabin.notnull()), "Cabin" ] = "Yes"
    df.loc[ (df.Cabin.isnull()), "Cabin" ] = "No"
    return df

data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
data_train

# 因为逻辑回归建模时,需要输入的特征都是数值型特征
# 我们先对类目型的特征离散/因子化
# 以Cabin为例,原本一个属性维度,因为其取值可以是["yes","no"],而将其平展开为"Cabin_yes","Cabin_no"两个属性
# 原本Cabin取值为yes的,在此处的"Cabin_yes"下取值为1,在"Cabin_no"下取值为0
# 原本Cabin取值为no的,在此处的"Cabin_yes"下取值为0,在"Cabin_no"下取值为1
# 我们使用pandas的get_dummies来完成这个工作,并拼接在原来的data_train之上,如下所示
dummies_Cabin = pd.get_dummies(data_train["Cabin"], prefix= "Cabin")

dummies_Embarked = pd.get_dummies(data_train["Embarked"], prefix= "Embarked")   #get_dummies的使用方法, 

dummies_Sex = pd.get_dummies(data_train["Sex"], prefix= "Sex")

dummies_Pclass = pd.get_dummies(data_train["Pclass"], prefix= "Pclass")

df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)     #注意最后用pd的concat 连接, numpy有concatenate连接方法
df.drop(["Pclass", "Name", "Sex", "Ticket", "Cabin", "Embarked"], axis=1, inplace=True)   #注意inplace, 删除列用drop
df

# Next, some more preprocessing: scale the wide-ranging features (standardize to zero mean / unit variance)
# so that logistic regression converges faster
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
age_scale_param = scaler.fit(df["Age"])
df["Age_scaled"] = scaler.fit_transform(df["Age"], age_scale_param)       #注意这种写法和普通的不同,Age,Fare开始使用相同的scaler,但又使用同的训练数据,所以把训练完的对象又单独加入fit_transform的参数中
fare_scale_param = scaler.fit(df["Fare"])
df["Fare_scaled"] = scaler.fit_transform(df["Fare"], fare_scale_param)
df


# 我们把需要的feature字段取出来,转成numpy格式,使用scikit-learn中的LogisticRegression建模
from sklearn import linear_model

train_df = df.filter(regex="Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*")   # 使用正则表达式过滤,filter(regex=  |)
train_np = train_df.as_matrix()

# y即Survival结果
y = train_np[:, 0]

# X即特征属性值
X = train_np[:, 1:]

# fit a LogisticRegression model
clf = linear_model.LogisticRegression(C=1.0, penalty="l1", tol=1e-6)
clf.fit(X, y)   
clf

X.shape

#测试集和训练集做一样的操作
data_test = pd.read_csv("test.csv")
data_test.loc[ (data_test.Fare.isnull()), "Fare" ] = 0
# 接着我们对test_data做和train_data中一致的特征变换
# 首先用同样的RandomForestRegressor模型填上丢失的年龄
tmp_df = data_test[["Age","Fare", "Parch", "SibSp", "Pclass"]]
null_age = tmp_df[data_test.Age.isnull()].as_matrix()
# 根据特征属性X预测年龄并补上
X = null_age[:, 1:]
predictedAges = rfr.predict(X)   #注意此处是训练集得来的模型
data_test.loc[ (data_test.Age.isnull()), "Age" ] = predictedAges

data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test["Cabin"], prefix= "Cabin")
dummies_Embarked = pd.get_dummies(data_test["Embarked"], prefix= "Embarked")
dummies_Sex = pd.get_dummies(data_test["Sex"], prefix= "Sex")
dummies_Pclass = pd.get_dummies(data_test["Pclass"], prefix= "Pclass")


df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_test.drop(["Pclass", "Name", "Sex", "Ticket", "Cabin", "Embarked"], axis=1, inplace=True)
df_test["Age_scaled"] = scaler.fit_transform(df_test["Age"], age_scale_param)       #来此训练集
df_test["Fare_scaled"] = scaler.fit_transform(df_test["Fare"], fare_scale_param)
df_test
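A sketch of the more conventional pattern, fitting the scalers on the training data only and reusing those parameters on the test set (newer scikit-learn expects 2-D input, hence the double brackets):

age_scaler = preprocessing.StandardScaler().fit(df[["Age"]])
fare_scaler = preprocessing.StandardScaler().fit(df[["Fare"]])
df_test["Age_scaled"] = age_scaler.transform(df_test[["Age"]]).ravel()    # training-set mean/std applied to the test column
df_test["Fare_scaled"] = fare_scaler.transform(df_test[["Fare"]]).ravel()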

test = df_test.filter(regex="Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*")
predictions = clf.predict(test)
result = pd.DataFrame({"PassengerId":data_test["PassengerId"].as_matrix(), "Survived":predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions.csv", index=False)

pd.read_csv("logistic_regression_predictions.csv")





import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve

# 用sklearn的learning_curve得到training_score和cv_score,使用matplotlib画出learning curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, 
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """
    画出data在某模型上的learning curve.
    参数解释
    ----------
    estimator : 你用的分类器。
    title : 表格的标题。
    X : 输入的feature,numpy类型
    y : 输入的target vector
    ylim : tuple格式的(ymin, ymax), 设定图像中纵坐标的最低点和最高点
    cv : 做cross-validation的时候,数据分成的份数,其中一份作为cv集,其余n-1份作为training(默认为3份)
    n_jobs : 并行的的任务数(默认1)
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)  # train_sizes are fractions (0.05 to 1.0); learning_curve returns the corresponding absolute sample counts
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:          #注意在Python中None和NULL的区分
            plt.ylim(*ylim)   # *ylim unpacks the (ymin, ymax) tuple into two arguments
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        plt.gca().invert_yaxis()         ###利用gca()获得ax的属性,然后利用invert_yaxis() 反向Y轴
        plt.grid()
    
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, 
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, 
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, "o-", color="b", label=u"训练集上得分")
        plt.plot(train_sizes, test_scores_mean, "o-", color="r", label=u"交叉验证集上得分")
    
        plt.legend(loc="best")
        
        plt.draw()
        plt.gca().invert_yaxis()
        plt.show()
    
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

plot_learning_curve(clf, u"学习曲线", X, y)


pd.DataFrame({"columns":list(train_df.columns)[1:], "coef":list(clf.coef_.T)})


from sklearn import cross_validation

# 简单看看打分情况
clf = linear_model.LogisticRegression(C=1.0, penalty="l1", tol=1e-6)
all_data = df.filter(regex="Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*")
X = all_data.as_matrix()[:,1:]
y = all_data.as_matrix()[:,0]
print cross_validation.cross_val_score(clf, X, y, cv=5)


# 分割数据
split_train, split_cv = cross_validation.train_test_split(df, test_size=0.3, random_state=0)
train_df = split_train.filter(regex="Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*")
# 生成模型
clf = linear_model.LogisticRegression(C=1.0, penalty="l1", tol=1e-6)
clf.fit(train_df.as_matrix()[:,1:], train_df.as_matrix()[:,0])


# 对cross validation数据进行预测

cv_df = split_cv.filter(regex="Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*")
predictions = clf.predict(cv_df.as_matrix()[:,1:])
split_cv[ predictions != cv_df.as_matrix()[:,0] ]   # view the mis-predicted rows


# 去除预测错误的case看原始dataframe数据
#split_cv["PredictResult"] = predictions
origin_data_train = pd.read_csv("Train.csv")
bad_cases = origin_data_train.loc[origin_data_train["PassengerId"].isin(split_cv[predictions != cv_df.as_matrix()[:,0]]["PassengerId"].values)]  #注意去除写法 isin
bad_cases

data_train[data_train["Name"].str.contains("Major")]



data_train = pd.read_csv("Train.csv")
data_train["Sex_Pclass"] = data_train.Sex + "_" + data_train.Pclass.map(str)

from sklearn.ensemble import RandomForestRegressor
 
### Fill in the missing Age values with a RandomForestRegressor
def set_missing_ages(df):
    
    # 把已有的数值型特征取出来丢进Random Forest Regressor中
    age_df = df[["Age","Fare", "Parch", "SibSp", "Pclass"]]

    # 乘客分成已知年龄和未知年龄两部分
    known_age = age_df[age_df.Age.notnull()].as_matrix()
    unknown_age = age_df[age_df.Age.isnull()].as_matrix()

    # y即目标年龄
    y = known_age[:, 0]

    # X即特征属性值
    X = known_age[:, 1:]

    # fit到RandomForestRegressor之中
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    
    # 用得到的模型进行未知年龄结果预测
    predictedAges = rfr.predict(unknown_age[:, 1::])
    
    # 用得到的预测结果填补原缺失数据
    df.loc[ (df.Age.isnull()), "Age" ] = predictedAges 
    
    return df, rfr

def set_Cabin_type(df):
    df.loc[ (df.Cabin.notnull()), "Cabin" ] = "Yes"
    df.loc[ (df.Cabin.isnull()), "Cabin" ] = "No"
    return df

data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)

dummies_Cabin = pd.get_dummies(data_train["Cabin"], prefix= "Cabin")
dummies_Embarked = pd.get_dummies(data_train["Embarked"], prefix= "Embarked")
dummies_Sex = pd.get_dummies(data_train["Sex"], prefix= "Sex")
dummies_Pclass = pd.get_dummies(data_train["Pclass"], prefix= "Pclass")
dummies_Sex_Pclass = pd.get_dummies(data_train["Sex_Pclass"], prefix= "Sex_Pclass")   #新增加特征


df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass, dummies_Sex_Pclass], axis=1)
df.drop(["Pclass", "Name", "Sex", "Ticket", "Cabin", "Embarked", "Sex_Pclass"], axis=1, inplace=True)
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
age_scale_param = scaler.fit(df["Age"])
df["Age_scaled"] = scaler.fit_transform(df["Age"], age_scale_param)
fare_scale_param = scaler.fit(df["Fare"])
df["Fare_scaled"] = scaler.fit_transform(df["Fare"], fare_scale_param)



from sklearn import linear_model

train_df = df.filter(regex="Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*")
train_np = train_df.as_matrix()

# y即Survival结果
y = train_np[:, 0]

# X即特征属性值
X = train_np[:, 1:]

# fit a LogisticRegression model
clf = linear_model.LogisticRegression(C=1.0, penalty="l1", tol=1e-6)
clf.fit(X, y)
clf




data_test = pd.read_csv("test.csv")
data_test.loc[ (data_test.Fare.isnull()), "Fare" ] = 0
data_test["Sex_Pclass"] = data_test.Sex + "_" + data_test.Pclass.map(str)
# 接着我们对test_data做和train_data中一致的特征变换
# 首先用同样的RandomForestRegressor模型填上丢失的年龄
tmp_df = data_test[["Age","Fare", "Parch", "SibSp", "Pclass"]]
null_age = tmp_df[data_test.Age.isnull()].as_matrix()
# 根据特征属性X预测年龄并补上
X = null_age[:, 1:]
predictedAges = rfr.predict(X)
data_test.loc[ (data_test.Age.isnull()), "Age" ] = predictedAges

data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test["Cabin"], prefix= "Cabin")
dummies_Embarked = pd.get_dummies(data_test["Embarked"], prefix= "Embarked")
dummies_Sex = pd.get_dummies(data_test["Sex"], prefix= "Sex")
dummies_Pclass = pd.get_dummies(data_test["Pclass"], prefix= "Pclass")
dummies_Sex_Pclass = pd.get_dummies(data_test["Sex_Pclass"], prefix= "Sex_Pclass")


df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass, dummies_Sex_Pclass], axis=1)
df_test.drop(["Pclass", "Name", "Sex", "Ticket", "Cabin", "Embarked", "Sex_Pclass"], axis=1, inplace=True)
df_test["Age_scaled"] = scaler.fit_transform(df_test["Age"], age_scale_param)
df_test["Fare_scaled"] = scaler.fit_transform(df_test["Fare"], fare_scale_param)
df_test


test = df_test.filter(regex="Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*")
predictions = clf.predict(test)
result = pd.DataFrame({"PassengerId":data_test["PassengerId"].as_matrix(), "Survived":predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions2.csv", index=False)


from sklearn.ensemble import BaggingRegressor

train_df = df.filter(regex="Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title")
train_np = train_df.as_matrix()

# y即Survival结果
y = train_np[:, 0]

# X即特征属性值
X = train_np[:, 1:]

# fit到BaggingRegressor之中
clf = linear_model.LogisticRegression(C=1.0, penalty="l1", tol=1e-6)
bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y) #用同一个模型,数据集分为10份

test = df_test.filter(regex="Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title")
predictions = bagging_clf.predict(test)
result = pd.DataFrame({"PassengerId":data_test["PassengerId"].as_matrix(), "Survived":predictions.astype(np.int32)})
result.to_csv("/Users/MLS/Downloads/logistic_regression_predictions2.csv", index=False)


Using two classifiers
import numpy as np
import pandas as pd
from pandas import  DataFrame
from patsy import dmatrices   #用于生成设计矩阵
import string
from operator import itemgetter
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split,StratifiedShuffleSplit,StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.externals import joblib      #持久化模块

##Read configuration parameters

train_file="train.csv"
MODEL_PATH="./"
test_file="test.csv"
SUBMISSION_PATH="./"
seed= 0

print train_file,seed

# 输出得分
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]    #注意此种排序的写法
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

#清理和处理数据
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if string.find(big_string, substring) != -1:  ##注意!=-1此种写法
            return substring
    print big_string
    return np.nan
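string.find(big_string, substring) relies on the Python 2 string module, which has no find() in Python 3; the same helper written in a Python-3-friendly way (a sketch):

def substrings_in_string_py3(big_string, substrings):
    for substring in substrings:
        if substring in big_string:   # equivalent to big_string.find(substring) != -1
            return substring
    print(big_string)
    return np.nan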

le = preprocessing.LabelEncoder()   #标签编码
enc=preprocessing.OneHotEncoder()   #OneHot编码

def clean_and_munge_data(df):
    #处理缺省值
    df.Fare = df.Fare.map(lambda x: np.nan if x==0 else x)   # treat zero fares as missing: map 0 -> NaN
    #处理一下名字,生成Title字段
    title_list=["Mrs", "Mr", "Master", "Miss", "Major", "Rev",
                "Dr", "Ms", "Mlle","Col", "Capt", "Mme", "Countess",
                "Don", "Jonkheer"]
    df["Title"]=df["Name"].map(lambda x: substrings_in_string(x, title_list))  #lambda 和map结合在处理缺失值的妙用

    #处理特殊的称呼,全处理成mr, mrs, miss, master
    def replace_titles(x):
        title=x["Title"]
        if title in ["Mr","Don", "Major", "Capt", "Jonkheer", "Rev", "Col"]:
            return "Mr"
        elif title in ["Master"]:
            return "Master"
        elif title in ["Countess", "Mme","Mrs"]:
            return "Mrs"
        elif title in ["Mlle", "Ms","Miss"]:
            return "Miss"
        elif title =="Dr":
            if x["Sex"]=="Male":
                return "Mr"
            else:
                return "Mrs"
        elif title =="":
            if x["Sex"]=="Male":
                return "Master"
            else:
                return "Miss"
        else:
            return title

    df["Title"]=df.apply(replace_titles, axis=1)  #apply(func,args,kwargs)从Python2.3开始,已经被func(*args,**kwargs)代替了.

    #看看家族是否够大,咳咳
    df["Family_Size"]=df["SibSp"]+df["Parch"]
    df["Family"]=df["SibSp"]*df["Parch"]


    df.loc[ (df.Fare.isnull())&(df.Pclass==1),"Fare"] =np.median(df[df["Pclass"] == 1]["Fare"].dropna())  #众数填充
    df.loc[ (df.Fare.isnull())&(df.Pclass==2),"Fare"] =np.median( df[df["Pclass"] == 2]["Fare"].dropna())
    df.loc[ (df.Fare.isnull())&(df.Pclass==3),"Fare"] = np.median(df[df["Pclass"] == 3]["Fare"].dropna())

    df["Gender"] = df["Sex"].map( {"female": 0, "male": 1} ).astype(int)       #注意map内部是字典

    df["AgeFill"]=df["Age"]
    mean_ages = np.zeros(4)
    mean_ages[0]=np.average(df[df["Title"] == "Miss"]["Age"].dropna())
    mean_ages[1]=np.average(df[df["Title"] == "Mrs"]["Age"].dropna())
    mean_ages[2]=np.average(df[df["Title"] == "Mr"]["Age"].dropna())
    mean_ages[3]=np.average(df[df["Title"] == "Master"]["Age"].dropna())
    df.loc[ (df.Age.isnull()) & (df.Title == "Miss") ,"AgeFill"] = mean_ages[0]
    df.loc[ (df.Age.isnull()) & (df.Title == "Mrs") ,"AgeFill"] = mean_ages[1]
    df.loc[ (df.Age.isnull()) & (df.Title == "Mr") ,"AgeFill"] = mean_ages[2]
    df.loc[ (df.Age.isnull()) & (df.Title == "Master") ,"AgeFill"] = mean_ages[3]

    df["AgeCat"]=df["AgeFill"]
    df.loc[ (df.AgeFill<=10) ,"AgeCat"] = "child"
    df.loc[ (df.AgeFill>60),"AgeCat"] = "aged"
    df.loc[ (df.AgeFill>10) & (df.AgeFill <=30) ,"AgeCat"] = "adult"
    df.loc[ (df.AgeFill>30) & (df.AgeFill <=60) ,"AgeCat"] = "senior"

    df.Embarked = df.Embarked.fillna("S")


    df.loc[ df.Cabin.isnull()==True,"Cabin"] = 0.5
    df.loc[ df.Cabin.isnull()==False,"Cabin"] = 1.5

    df["Fare_Per_Person"]=df["Fare"]/(df["Family_Size"]+1)

    #Age times class

    df["AgeClass"]=df["AgeFill"]*df["Pclass"]
    df["ClassFare"]=df["Pclass"]*df["Fare_Per_Person"]


    df["HighLow"]=df["Pclass"]
    df.loc[ (df.Fare_Per_Person<8) ,"HighLow"] = "Low"
    df.loc[ (df.Fare_Per_Person>=8) ,"HighLow"] = "High"



    le.fit(df["Sex"] )
    x_sex=le.transform(df["Sex"])
    df["Sex"]=x_sex.astype(np.float)

    le.fit( df["Ticket"])
    x_Ticket=le.transform( df["Ticket"])
    df["Ticket"]=x_Ticket.astype(np.float)

    le.fit(df["Title"])
    x_title=le.transform(df["Title"])
    df["Title"] =x_title.astype(np.float)

    le.fit(df["HighLow"])
    x_hl=le.transform(df["HighLow"])
    df["HighLow"]=x_hl.astype(np.float)


    le.fit(df["AgeCat"])
    x_age=le.transform(df["AgeCat"])
    df["AgeCat"] =x_age.astype(np.float)

    le.fit(df["Embarked"])
    x_emb=le.transform(df["Embarked"])
    df["Embarked"]=x_emb.astype(np.float)

    df = df.drop(["PassengerId","Name","Age","Cabin"], axis=1) #remove Name,Age and PassengerId


    return df

#读取数据
traindf=pd.read_csv(train_file)
##清洗数据
df=clean_and_munge_data(traindf)
########################################formula################################
 
formula_ml="Survived~Pclass+C(Title)+Sex+C(AgeCat)+Fare_Per_Person+Fare+Family_Size" #这一部重要,需要研究

y_train, x_train = dmatrices(formula_ml, data=df, return_type="dataframe")   # dmatrices builds the target vector and design matrix from the formula above (C(...) marks categorical terms)
y_train = np.asarray(y_train).ravel()
print y_train.shape,x_train.shape

##选择训练和测试集
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=0.2,random_state=seed)
#初始化分类器
clf=RandomForestClassifier(n_estimators=500, criterion="entropy", max_depth=5, min_samples_split=1,
  min_samples_leaf=1, max_features="auto",    bootstrap=False, oob_score=False, n_jobs=1, random_state=seed,
  verbose=0)

###grid search找到最好的参数
param_grid = dict( )
##创建分类pipeline
pipeline=Pipeline([ ("clf",clf) ])
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3,scoring="accuracy",
cv=StratifiedShuffleSplit(Y_train, n_iter=10, test_size=0.2, train_size=None, indices=None,    #CV嵌套在GridSearch里面,CV使用Y_train分割
random_state=seed, n_iterations=None)).fit(X_train, Y_train)
# 对结果打分
print("Best score: %0.3f" % grid_search.best_score_)      #注意,best_score_
print(grid_search.best_estimator_)
report(grid_search.grid_scores_)
 
print("-----grid search end------------")
print ("on all train set")
scores = cross_val_score(grid_search.best_estimator_, x_train, y_train,cv=3,scoring="accuracy") #全量, 注意best_estimator
print scores.mean(),scores
print ("on test set")
scores = cross_val_score(grid_search.best_estimator_, X_test, Y_test,cv=3,scoring="accuracy")
print scores.mean(),scores

# 对结果打分

print(classification_report(Y_train, grid_search.best_estimator_.predict(X_train) ))
print("test data")
print(classification_report(Y_test, grid_search.best_estimator_.predict(X_test) ))

model_file=MODEL_PATH+"model-rf.pkl"
joblib.dump(grid_search.best_estimator_, model_file)


Kaggle bike sharing demand competition   https://www.kaggle.com/c/bike-sharing-demand
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

df_train = pd.read_csv("kaggle_bike_competition_train.csv",header = 0)

df_train.head(10)

df_train.dtypes

#让它告诉我们形状
df_train.shape
df_train.count()

type(df_train.datetime)

# 把月、日、和 小时单独拎出来,放到3列中
df_train["month"] = pd.DatetimeIndex(df_train.datetime).month       #处理时间,使用pd.DatetimeIndex().month
df_train["day"] = pd.DatetimeIndex(df_train.datetime).dayofweek
df_train["hour"] = pd.DatetimeIndex(df_train.datetime).hour

# 那个,保险起见,咱们还是先存一下吧
df_train_origin = df_train
# 抛掉不要的字段
df_train = df_train.drop(["datetime","casual","registered"], axis = 1)

# 看一眼
df_train.head(5)

df_train_target = df_train["count"].values       #注意后面加了value
df_train_data = df_train.drop(["count"],axis = 1).values
print "df_train_data shape is ", df_train_data.shape
print "df_train_target shape is ", df_train_target.shape

from sklearn import linear_model
from sklearn import cross_validation
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.learning_curve import learning_curve
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import explained_variance_score

# 总得切分一下数据咯(训练集和测试集)
cv = cross_validation.ShuffleSplit(len(df_train_data), n_iter=3, test_size=0.2,   # 注意在此处使用的是len,最终使用的是索引
    random_state=0)

# 各种模型来一圈

print "岭回归"    
for train, test in cv:    
    svc = linear_model.Ridge().fit(df_train_data[train], df_train_target[train])
    print("train score: {0:.3f}, test score: {1:.3f}
".format(
        svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))
    
print "支持向量回归/SVR(kernel="rbf",C=10,gamma=.001)"
for train, test in cv:
    
    svc = svm.SVR(kernel ="rbf", C = 10, gamma = .001).fit(df_train_data[train], df_train_target[train])
    print("train score: {0:.3f}, test score: {1:.3f}
".format(
        svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))
    
print "随机森林回归/Random Forest(n_estimators = 100)"    
for train, test in cv:    
    svc = RandomForestRegressor(n_estimators = 100).fit(df_train_data[train], df_train_target[train])
    print("train score: {0:.3f}, test score: {1:.3f}
".format(
        svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))


X = df_train_data
y = df_train_target

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=0)

tuned_parameters = [{"n_estimators":[10,100,500]}]   
    
scores = ["r2"]

for score in scores:
    
    print score
    
    clf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)

    print("别!喝!咖!啡!了!最佳参数找到了亲!!:")
    print ""
    #best_estimator_ returns the best estimator chosen by the search
    print(clf.best_estimator_)
    print ""
    print("得分分别是:")
    print ""
    #grid_scores_的返回值:
    #    * a dict of parameter settings
    #    * the mean score over the cross-validation folds 
    #    * the list of scores for each fold
    for params, mean_score, scores in clf.grid_scores_:   # grid_scores_ holds the cross-validation (held-out fold) scores for each parameter setting
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
    print ""



def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, "o-", color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, "o-", color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


title = "Learning Curves (Random Forest, n_estimators = 100)"
cv = cross_validation.ShuffleSplit(df_train_data.shape[0], n_iter=10,test_size=0.2, random_state=0)
estimator = RandomForestRegressor(n_estimators = 100)
plot_learning_curve(estimator, title, X, y, (0.0, 1.01), cv=cv, n_jobs=4)

plt.show()

# 尝试一下缓解过拟合,当然,未必成功
print "随机森林回归/Random Forest(n_estimators=200, max_features=0.6, max_depth=15)"   # 这里调高了n_estimators,max_ 的数量,在Random Forest里面降低模型复杂度?
for train, test in cv: 
    svc = RandomForestRegressor(n_estimators = 200, max_features=0.6, max_depth=15).fit(df_train_data[train], df_train_target[train])
    print("train score: {0:.3f}, test score: {1:.3f}
".format(
        svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))

# 看你们自己的咯
df_train_registered = df_train_origin.drop(["datetime","casual","count"], axis = 1)
df_train_casual = df_train_origin.drop(["datetime","count","registered"], axis = 1)

df_train_registered.head()

# 风速
df_train_origin.groupby("windspeed").mean().plot(y="count", marker="o")  #注意groupby分组统计后直接作图
plt.show()

# 湿度
df_train_origin.groupby("humidity").mean().plot(y="count", marker="o")
plt.show()

#温度湿度变化
df_train_origin.plot(x="temp", y="humidity", kind="scatter")           #直接作图
plt.show()


# scatter一下各个维度
fig, axs = plt.subplots(2, 3, sharey=True)
df_train_origin.plot(kind="scatter", x="temp", y="count", ax=axs[0, 0], figsize=(16, 8), color="magenta")
df_train_origin.plot(kind="scatter", x="atemp", y="count", ax=axs[0, 1], color="cyan")
df_train_origin.plot(kind="scatter", x="humidity", y="count", ax=axs[0, 2], color="red")
df_train_origin.plot(kind="scatter", x="windspeed", y="count", ax=axs[1, 0], color="yellow")
df_train_origin.plot(kind="scatter", x="month", y="count", ax=axs[1, 1], color="blue")
df_train_origin.plot(kind="scatter", x="hour", y="count", ax=axs[1, 2], color="green")

sns.pairplot(df_train_origin[["temp", "month", "humidity", "count"]], hue="count")  # 注意seabosn中的pairplot 画多个变量之间的关系
corr = df_train_origin[["temp","weather","windspeed","day", "month", "hour","count"]].corr() # corr计算各特征变量之间的关联度
corr

plt.figure()
plt.matshow(corr)         # 显示相关性图,matshow
plt.colorbar()          # 颜色par
plt.show()

Feature engineering (dataset from Data Hackathon 3.x)

import pandas as pd
import numpy as np
%matplotlib inline

#载入数据:
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")

train.dtypes

train.head(5)

#合成一个总的data
train["source"]= "train"
test["source"] = "test"
data=pd.concat([train, test],ignore_index=True)       # combine train and test with pandas concat; ignore_index renumbers the rows
data.shape

data.apply(lambda x: sum(x.isnull()))  # 注意用此方式查看缺省值  

var = ["Gender","Salary_Account","Mobile_Verified","Var1","Filled_Form","Device_Type","Var2","Source"]
for v in var:
    print "
%s这一列数据的不同取值和出现的次数
"%v
    print data[v].value_counts()
	
	
len(data["City"].unique())  # 注意unique的使用
data.drop("City",axis=1,inplace=True)

data["DOB"].head()

#创建一个年龄的字段Age
data["Age"] = data["DOB"].apply(lambda x: 115 - int(x[-2:]))
data["Age"].head()

#把原始的DOB字段去掉:
data.drop("DOB",axis=1,inplace=True)

data.boxplot(column=["EMI_Loan_Submitted"],return_type="axes")

#好像缺失值比较多,干脆就开一个新的字段,表明是缺失值还是不是缺失值
data["EMI_Loan_Submitted_Missing"] = data["EMI_Loan_Submitted"].apply(lambda x: 1 if pd.isnull(x) else 0)
data[["EMI_Loan_Submitted","EMI_Loan_Submitted_Missing"]].head(10)

#原始那一列就可以不要了
data.drop("EMI_Loan_Submitted",axis=1,inplace=True)

len(data["Employer_Name"].value_counts())

#丢掉
data.drop("Employer_Name",axis=1,inplace=True)

data.boxplot(column="Existing_EMI",return_type="axes")

data["Existing_EMI"].describe()

# few missing values here; fill them with 0
data["Existing_EMI"].fillna(0, inplace=True)

data.boxplot(column=["Interest_Rate"],return_type="axes")

#缺省值太多,也造一个字段,表示有无
data["Interest_Rate_Missing"] = data["Interest_Rate"].apply(lambda x: 1 if pd.isnull(x) else 0)    #造一个有无的字段
print data[["Interest_Rate","Interest_Rate_Missing"]].head(10)

data.drop("Interest_Rate",axis=1,inplace=True)

#找中位数去填补缺省值(因为缺省的不多)
data["Loan_Amount_Applied"].fillna(data["Loan_Amount_Applied"].median(),inplace=True)
data["Loan_Tenure_Applied"].fillna(data["Loan_Tenure_Applied"].median(),inplace=True)


# 缺省值太多。。。是否缺省。。。
data["Loan_Amount_Submitted_Missing"] = data["Loan_Amount_Submitted"].apply(lambda x: 1 if pd.isnull(x) else 0)
data["Loan_Tenure_Submitted_Missing"] = data["Loan_Tenure_Submitted"].apply(lambda x: 1 if pd.isnull(x) else 0)

data["Source"] = data["Source"].apply(lambda x: "others" if x not in ["S122","S133"] else x)
data["Source"].value_counts()

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()        # 数值编码, 原来为object类型,转化为int类型
var_to_encode = ["Device_Type","Filled_Form","Gender","Var1","Var2","Mobile_Verified","Source"]
for col in var_to_encode:
    data[col] = le.fit_transform(data[col])

data = pd.get_dummies(data, columns=var_to_encode)  #类别型的One-Hot 编码, 此处先把类别行的用LabelEncoder编码为数字,然后在转化为one_hot编码,可以直接one_hot,只是起的列名字不同而已
data.columns	
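A tiny illustration of the two encoding steps above (toy column, not the competition data): LabelEncoder maps the categories to integer codes, and get_dummies then expands the coded column into one-hot columns.

toy = pd.DataFrame({"Device_Type": ["Mobile", "Web", "Mobile"]})
toy["Device_Type"] = LabelEncoder().fit_transform(toy["Device_Type"])   # "Mobile"/"Web" -> 0/1
print(pd.get_dummies(toy, columns=["Device_Type"]))                     # Device_Type_0 and Device_Type_1 columns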
	
train = data.loc[data["source"]=="train"]
test = data.loc[data["source"]=="test"]


XGBoost model tuning


import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams["figure.figsize"] = 12, 4           #注意此处, 默认设置图形大小


train = pd.read_csv("train_modified.csv")
test = pd.read_csv("test_modified.csv")

train.shape, test.shape

target="Disbursed"
IDcol = "ID"

train["Disbursed"].value_counts()

#test_results = pd.read_csv("test_results.csv")
def modelfit(alg, dtrain, dtest, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        xgtest = xgb.DMatrix(dtest[predictors].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()["n_estimators"], nfold=cv_folds,
             early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #建模
    alg.fit(dtrain[predictors], dtrain["Disbursed"],eval_metric="auc")
        
    #对训练集预测
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #输出模型的一些结果
    print "
关于现在这个模型"
    print "准确率 : %.4g" % metrics.accuracy_score(dtrain["Disbursed"].values, dtrain_predictions)
    print "AUC 得分 (训练集): %f" % metrics.roc_auc_score(dtrain["Disbursed"], dtrain_predprob)
                
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind="bar", title="Feature Importances")
    plt.ylabel("Feature Importance Score")
	
	
predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = XGBClassifier(
        learning_rate =0.1,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= "binary:logistic",
        nthread=4,
        scale_pos_weight=1,
        seed=27)
modelfit(xgb1, train, test, predictors)

#对subsample 和 max_features 用grid search查找最好的参数
param_test1 = {
    "max_depth":range(3,10,2),
    "min_child_weight":range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
                                        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= "binary:logistic", nthread=4, scale_pos_weight=1, seed=27), 
                       param_grid = param_test1, scoring="roc_auc",n_jobs=4,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
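grid_scores_ belongs to the old sklearn.grid_search module used here; if this is run with the newer sklearn.model_selection.GridSearchCV instead, the same information is exposed through cv_results_ (a sketch under that assumption):

cv_results = pd.DataFrame(gsearch1.cv_results_)                       # newer-sklearn equivalent of grid_scores_
print(cv_results[["params", "mean_test_score", "std_test_score"]]
      .sort_values("mean_test_score", ascending=False).head())
print(gsearch1.best_params_, gsearch1.best_score_)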



# 对于max_depth和min_child_weight查找最好的参数
param_test2 = {
    "max_depth":[4,5,6],
    "min_child_weight":[4,5,6]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
                                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= "binary:logistic", nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test2, scoring="roc_auc",n_jobs=4,iid=False, cv=5)
gsearch2.fit(train[predictors],train[target])



#交叉验证对min_child_weight寻找最合适的参数
param_test2b = {
    "min_child_weight":[6,8,10,12]
}
gsearch2b = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=4,
                                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= "binary:logistic", nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test2b, scoring="roc_auc",n_jobs=4,iid=False, cv=5)
gsearch2b.fit(train[predictors],train[target])



#Grid seach选择合适的gamma
param_test3 = {
    "gamma":[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
                                        min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= "binary:logistic", nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test3, scoring="roc_auc",n_jobs=4,iid=False, cv=5)
gsearch3.fit(train[predictors],train[target])


predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb2 = XGBClassifier(
        learning_rate =0.1,
        n_estimators=1000,
        max_depth=4,
        min_child_weight=6,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= "binary:logistic",
        nthread=4,
        scale_pos_weight=1,
        seed=27)
modelfit(xgb2, train, test, predictors)



#对subsample 和 colsample_bytree用grid search寻找最合适的参数
param_test4 = {
    "subsample":[i/10.0 for i in range(6,10)],
    "colsample_bytree":[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
                                        min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= "binary:logistic", nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test4, scoring="roc_auc",n_jobs=4,iid=False, cv=5)
gsearch4.fit(train[predictors],train[target])



# 同上
param_test5 = {
    "subsample":[i/100.0 for i in range(75,90,5)],
    "colsample_bytree":[i/100.0 for i in range(75,90,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
                                        min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= "binary:logistic", nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test5, scoring="roc_auc",n_jobs=4,iid=False, cv=5)
gsearch5.fit(train[predictors],train[target])



#对reg_alpha用grid search寻找最合适的参数
param_test6 = {
    "reg_alpha":[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
                                        min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                                        objective= "binary:logistic", nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test6, scoring="roc_auc",n_jobs=4,iid=False, cv=5)
gsearch6.fit(train[predictors],train[target])



# 换一组参数对reg_alpha用grid search寻找最合适的参数
param_test7 = {
    "reg_alpha":[0, 0.001, 0.005, 0.01, 0.05]
}
gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
                                        min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                                        objective= "binary:logistic", nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test7, scoring="roc_auc",n_jobs=4,iid=False, cv=5)
gsearch7.fit(train[predictors],train[target])

xgb3 = XGBClassifier(
        learning_rate =0.1,
        n_estimators=1000,
        max_depth=4,
        min_child_weight=6,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.005,
        objective= "binary:logistic",
        nthread=4,
        scale_pos_weight=1,
        seed=27)
modelfit(xgb3, train, test, predictors)

Lesson 2

House price prediction case study

import numpy as np
import pandas as pd
import xgboost

train_df = pd.read_csv("../input/train.csv", index_col=0)    #注意../ 代表上一个目录?
test_df = pd.read_csv("../input/test.csv", index_col=0)

%matplotlib inline
prices = pd.DataFrame({"price":train_df["SalePrice"], "log(price + 1)":np.log1p(train_df["SalePrice"])})   # log1p is log(1+x), which handles x=0 smoothly and is inverted later with expm1; a skewed regression target benefits from this transform (not needed for classification)
prices.hist()
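log1p is log(1 + x) and expm1 is its exact inverse, which is why the final predictions are mapped back with np.expm1 further down; a quick round-trip check (illustrative values):

x = np.array([0.0, 100.0, 180000.0])
assert np.allclose(np.expm1(np.log1p(x)), x)   # transform and invert without loss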

y_train = np.log1p(train_df.pop("SalePrice"))

all_df = pd.concat((train_df, test_df), axis=0)  # pandas 里面的concat 合并

all_df.shape

all_df["MSSubClass"].dtypes

all_df["MSSubClass"] = all_df["MSSubClass"].astype(str)  # 转换为astype

all_df["MSSubClass"].value_counts()

pd.get_dummies(all_df["MSSubClass"], prefix="MSSubClass").head()

all_dummy_df = pd.get_dummies(all_df)    #把所有的数据进行了one-hot-encode
all_dummy_df.head()

all_dummy_df.isnull().sum().sort_values(ascending=False).head(10)  #缺失值, sum , sort_values

mean_cols = all_dummy_df.mean()
mean_cols.head(10)

all_dummy_df = all_dummy_df.fillna(mean_cols)
all_dummy_df.isnull().sum().sum()

numeric_cols = all_df.columns[all_df.dtypes != "object"]         #判断对那些列是numerical类型,即不是对象类型的, 注意此处使用的是dtypes!="object", 注意取出是个列表
numeric_cols


numeric_col_means = all_dummy_df.loc[:, numeric_cols].mean()            #计算出所有数值型数字的标准列
numeric_col_std = all_dummy_df.loc[:, numeric_cols].std()             
all_dummy_df.loc[:, numeric_cols] = (all_dummy_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std  # 标准化

dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]

from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

X_train = dummy_train_df.values    #此处是values 注意DataFrame 转化为narray的方式
X_test = dummy_test_df.values


alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
    clf = Ridge(alpha)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring="neg_mean_squared_error"))  #注意此处有个负号, 和scoring 的选择有关
    test_scores.append(np.mean(test_score))
	
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(alphas, test_scores)
plt.title("Alpha vs CV Error");

from sklearn.ensemble import RandomForestRegressor

max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for max_feat in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring="neg_mean_squared_error"))
    test_scores.append(np.mean(test_score))
	
plt.plot(max_features, test_scores)
plt.title("Max Features vs CV Error");

ridge = Ridge(alpha=15)
rf = RandomForestRegressor(n_estimators=500, max_features=.3)

ridge.fit(X_train, y_train)
rf.fit(X_train, y_train)

y_ridge = np.expm1(ridge.predict(X_test))   #注意前面用了log(x+1),此处我们用的expm1是反过程
y_rf = np.expm1(rf.predict(X_test))


y_final = (y_ridge + y_rf) / 2          #模型融合  取平均

submission_df = pd.DataFrame(data= {"Id" : test_df.index, "SalePrice": y_final})

House price prediction case study (advanced)

dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]

dummy_train_df.shape, dummy_test_df.shape

X_train = dummy_train_df.values
X_test = dummy_test_df.values

from sklearn.linear_model import Ridge
ridge = Ridge(15)

from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score

params = [1, 10, 15, 20, 25, 30, 40]
test_scores = []
for param in params:                # bagging trains copies of the same ridge base estimator on different bootstrap samples
    clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring="neg_mean_squared_error"))
    test_scores.append(np.mean(test_score))
	
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(params, test_scores)
plt.title("n_estimator vs CV Error");


params = [10, 15, 20, 25, 30, 40, 50, 60, 70, 100]
test_scores = []
for param in params:
    clf = BaggingRegressor(n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring="neg_mean_squared_error"))
    test_scores.append(np.mean(test_score))
	
	
from sklearn.ensemble import AdaBoostRegressor

params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
test_scores = []
for param in params:
    clf = AdaBoostRegressor(n_estimators=param, base_estimator=ridge)   # use the AdaBoostRegressor imported above (plain bagging was already covered in the previous cell)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring="neg_mean_squared_error"))
    test_scores.append(np.mean(test_score))
	

from xgboost import XGBRegressor

params = [1,2,3,4,5,6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth=param)
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring="neg_mean_squared_error"))
    test_scores.append(np.mean(test_score))
	

Predicting financial market moves from daily news (basic version)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from datetime import date

data = pd.read_csv("../input/Combined_News_DJIA.csv")

data["combined_news"] = data.filter(regex=("Top.*")).apply(lambda x: "".join(str(x.values)), axis=1)  #注意pd中使用filter.后使用apply, 在axis=1,此时lambda 中的x是一列一列,是每个Series

train = data[data["Date"] < "2015-01-01"]
test = data[data["Date"] > "2014-12-31"]

feature_extraction = TfidfVectorizer()
X_train = feature_extraction.fit_transform(train["combined_news"].values)
X_test = feature_extraction.transform(test["combined_news"].values)
	
y_train = train["Label"].values
y_test = test["Label"].values

clf = SVC(probability=True, kernel="rbf")
clf.fit(X_train, y_train)

predictions = clf.predict_proba(X_test)

print("ROC-AUC yields " + str(roc_auc_score(y_test, predictions[:,1])))

Advanced version
X_train = train["combined_news"].str.lower().str.replace(""", "").str.replace(""", "").str.split()
X_test = test["combined_news"].str.lower().str.replace(""", "").str.replace(""", "").str.split()

from nltk.corpus import stopwords
stop = stopwords.words("english")
 
import re
def hasNumbers(inputString):
    return bool(re.search(r"\d", inputString))
	
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def check(word):
    """