
Machine Learning Cheat Sheet (DRAFT)

text & sentiment analytics

This is a draft cheat sheet. It is a work in progress and is not finished yet.

Time Series

Unify date formats
from datetime import datetime
df['Date'] = [datetime.strptime(x, "%b-%y") for x in df.Period]
Create a new Date column, parsing dates like Jan-21 in the Period column into the standard 2021-01-01 form
df_b['Date'] = [datetime.strptime(x, "%d/%m/%y") for x in df_b.iloc[:,0]]
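pandas can also parse the whole column in one call; a minimal sketch of the same conversion, assuming Period holds strings like Jan-21:
df['Date'] = pd.to_datetime(df.Period, format="%b-%y")  # import pandas as pd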
Column-name operations
df.columns = ["Period", "GoldPrice", "BondYield"]
Rename the columns
df.columns = [each.strip() for each in df.columns]
Strip the spaces around each column name
Data extraction
df_gold = df[df.Gold.isna() == False].loc[:, ["Top Producers","Gold"]]
Keep the rows where Gold is not null, selecting only the listed columns
Plotting
df.groupby("Date").Gold.mean().plot(kind="line")
Group by date, take the mean gold price, and draw a line plot
df.loc[:, ["Gold", "Silver","Date"]].groupby("Date").mean().plot()
Group by date, take the means of gold and silver at once, and plot both as lines
df.groupby("month")[["column0","column1","column2"]].mean().plot()
An alternative way to write it
df.plot(kind="line", y="Gold ETF", x="Date")
Draw a line plot with the x and y columns given explicitly
sns.heatmap(df.loc[:, ["Gold", "Platinum","Silver"]].corr(), annot=True)
Compute the pairwise correlations of gold, platinum, and silver, then draw them as an annotated heatmap
sns.barplot(data=df_platinum, y="Platinum", x="Top Producers")
Plot with seaborn (import seaborn as sns)
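A minimal end-to-end sketch tying the steps above together; the CSV file name and its Period/Gold columns are assumptions for illustration:
import pandas as pd
from datetime import datetime
df = pd.read_csv("gold.csv")  # hypothetical file with Period and Gold columns
df.columns = [each.strip() for each in df.columns]  # clean the column names
df['Date'] = [datetime.strptime(x, "%b-%y") for x in df.Period]  # parse the dates
df.groupby("Date").Gold.mean().plot(kind="line")  # mean gold price over time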

Boosting Classifiers

AdaBoost
from sklearn.ensemble import AdaBoostClassifier
Import AdaBoost
classifier = AdaBoostClassifier(n_estimators=3, learning_rate=0.2, random_state=0)
Set the parameters
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
from sklearn.model_selection import GridSearchCV
Search for the best parameters
param_grid = {'n_estimators': [1, 10, 100], 'learning_rate': [0.2, 0.4, 0.6, 0.8]}
grid = GridSearchCV(AdaBoostClassifier(), param_grid, scoring='accuracy')
grid.fit(x_train, y_train)
grid.best_estimator_, grid.best_params_, grid.cv_results_
GradientBoost
from sklearn.ensemble import GradientBoostingClassifier
Import GradientBoosting
classifier = GradientBoostingClassifier(n_estimators=1, learning_rate=0.4, max_depth=1, random_state=0)
classifier.fit(x_train, y_train)
param_grid = {'n_estimators': [1, 10, 100], 'learning_rate': [0.2, 0.4, 0.6, 0.8]}
grid = GridSearchCV(GradientBoostingClassifier(), param_grid, scoring='accuracy')
grid.fit(x_train, y_train)
grid.best_estimator_, grid.best_params_, grid.cv_results_
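A runnable sketch of the grid search above; the breast-cancer toy dataset stands in for real data:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
x, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
param_grid = {'n_estimators': [1, 10, 100], 'learning_rate': [0.2, 0.4, 0.6, 0.8]}
grid = GridSearchCV(AdaBoostClassifier(random_state=0), param_grid, scoring='accuracy')
grid.fit(x_train, y_train)
print(grid.best_params_, grid.score(x_test, y_test))  # best settings and held-out accuracy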

SVM

Standardization: rescale numeric variables to zero mean and unit variance
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_transformed = ss.fit_transform(x)
Split into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_transformed, y, test_size=0.3, random_state=0)
Split with 70% for training and 30% for testing
Support vector regression
from sklearn.svm import SVR
regression = SVR()
regression.fit(x_train, y_train)
Fit on the training set
y_pred_SVR = regression.predict(x_test)
Predict on the test set
Support vector classifier
from sklearn.svm import SVC
df["y"] = pd.cut(x=df.col0, bins=[0, 6, 10], labels=[0, 1])
Bin the values of col0 into two groups and store the labels in a new column (with the default right-closed bins these are (0, 6] and (6, 10]); a categorical y suits a classifier
classifier = SVC(kernel=kernel, random_state=0)
kernel can be 'linear', 'poly', 'rbf', or 'sigmoid'; accuracy changes accordingly
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
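Since kernel is a variable above, a simple loop can compare the options; a sketch, assuming the x_train/x_test/y_train/y_test split from earlier:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    classifier = SVC(kernel=kernel, random_state=0).fit(x_train, y_train)
    print(kernel, accuracy_score(y_test, classifier.predict(x_test)))  # accuracy per kernel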
Evaluating results
For regression
from sklearn.metrics import mean_squared_error, mean_absolute_error
mean_absolute_error(y_test, y_pred_SVR)
MAE: the mean of the absolute errors between predicted and actual values
mean_squared_error(y_test, y_pred_SVR)
MSE: the mean of the squared errors between predicted and actual values
For classification
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
accuracy_score(y_test, y_pred)
accuracy = (TP+TN)/(TP+TN+FP+FN)
confusion_matrix(y_test, y_pred)
Detailed counts of TP, TN, FP, and FN
classification_report(y_test, y_pred)
Includes precision, recall, F1-score, and support for each class
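For reference, confusion_matrix puts true labels on the rows and predicted labels on the columns; a tiny sketch with made-up labels:
from sklearn.metrics import confusion_matrix
y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
print(confusion_matrix(y_true, y_pred))
# [[2 0]
#  [1 2]]  -> TN=2, FP=0, FN=1, TP=2, so accuracy = (2+2)/5 = 0.8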

Feature Engineering

pip install scikit-image
Install the package
Read and display an image
from skimage import io
food = io.imread("chips1.jpg")
io.imshow(food)
Change the image's colors
from skimage.color import *
io.imshow(rgb2gray(food))
Convert the color image to grayscale and display it
Apply a filter to the image
from skimage.filters import *
io.imshow(laplace(food, ksize=3, mask=None))
Apply a Laplace filter with kernel size 3
Resize the image
from skimage import transform
image = transform.resize(image, (2000, 2000))
print(image.shape)
Resize to the given dimensions and check the new shape
Principal component analysis
from sklearn.decomposition import PCA
pca = PCA(n_components=30).fit(chip)
Fit a PCA model to the chip image data, keeping 30 principal components
x_new = pca.transform(chip)
Project the chip image data onto the new feature space of the 30 retained components
recdata = pca.inverse_transform(x_new)
Reconstruct the image data, representing the original image with only the 30 principal components
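A sketch of the full compress-and-reconstruct round trip; chip is assumed to be a 2-D grayscale array in the 0-1 range:
import numpy as np
from sklearn.decomposition import PCA
from skimage import io
pca = PCA(n_components=30).fit(chip)
recdata = pca.inverse_transform(pca.transform(chip))  # rebuild from 30 components
io.imshow(np.clip(recdata, 0, 1))  # clip back into the displayable 0-1 range
print(pca.explained_variance_ratio_.sum())  # share of variance the 30 components keep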
import os
os.listdir(".")
List all file and directory names in the current working directory
os.chdir("directorypath")
Change the working directory
os.getcwd()
Get the current working directory
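A tiny sketch that combines these calls to list only the images in the current directory (the .jpg filter is an assumption):
import os
print(os.getcwd())  # confirm where we are
jpgs = [f for f in os.listdir(".") if f.endswith(".jpg")]  # image files only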

K-NN

Import K-NN
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=6)
The default value of n_neighbors is 5
classifier.fit(x_train, y_train)
Train the model
accuracy_score(y_test, classifier.predict(x_test))
Compute the accuracy (accuracy_score comes from sklearn.metrics, imported above)
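The choice of n_neighbors matters; a sketch that scans several values of k, assuming the train/test split from the SVM section:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
for k in range(1, 11):
    clf = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    print(k, accuracy_score(y_test, clf.predict(x_test)))  # accuracy for each k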

Text Analytics

s.strip()
Remove leading and trailing whitespace (s is a string)
a.upper() / a.lower()
Convert string a to upper or lower case
Tokenization
import nltk
tokens = nltk.word_tokenize(text)
Split a long text into tokens on whitespace and punctuation
Open and read a file
with open('sample.txt', 'r', encoding='utf-8') as f:
    tokens = nltk.word_tokenize(f.read())
Part-of-speech (POS) tagging
nltk.download('averaged_perceptron_tagger')
tagged = nltk.pos_tag(tokens)
Attach a POS tag to each token
Stemming: strip affixes (may produce invalid words)
from nltk.stem import PorterStemmer
ps = PorterStemmer()
print(ps.stem('campaigning'))
Lemmatization: reduce words to their dictionary form (generally valid words)
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
wnl.lemmatize('beaten', 'v')
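A side-by-side sketch of the two normalizers (lemmatize needs a POS hint such as 'v' for verb, and the wordnet data must be downloaded first):
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
print(PorterStemmer().stem('campaigning'))           # 'campaign'
print(WordNetLemmatizer().lemmatize('beaten', 'v'))  # 'beat'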
Sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(text)['compound']
Score the sentiment of a long text and return the compound score
for index, row in df.iterrows():
    compound_score = analyzer.polarity_scores(row['clean_text'])['compound']
Read the clean_text column of the DataFrame row by row and compute each compound score
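A vectorized alternative to the iterrows loop, storing each score in a new column (column names follow the example above):
df['compound'] = df['clean_text'].apply(lambda t: analyzer.polarity_scores(t)['compound'])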

Web Scraping

import requests
Used to send HTTP requests (the library is requests, not request)
response = requests.get(url)
Fetch the data
result = response.json()
Decode the JSON response (then print the result)
Beautiful Soup
from bs4 import BeautifulSoup
Parse and process web pages
r = requests.get(url)
Request the page; url is a variable holding the address
soup = BeautifulSoup(r.content, 'html.parser')
Parse the fetched content
print(soup.title)
Get the page title
title = soup.find_all("h6", "h6 list-object__heading")
Use find_all to search for content: the first argument is the tag, the second the class (here, news headlines)
each_title = title[0].text
Use .text to get the headline text (find_all returns a list, so index into it or loop over it)
each_title = each_title.strip()
Strip the whitespace around the title, then print(each_title)
data2 = r.json()
Parse the JSON response into a Python dict
data2.keys()
Inspect the keys the JSON contains
data2['help']
help is one of the key names
data2['result']['records']
Drill down through the JSON hierarchy to the records
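Tying the scraping steps together, a sketch that fetches a page and prints every matching headline; the url variable and the h6 class name are placeholders carried over from the example above:
import requests
from bs4 import BeautifulSoup
r = requests.get(url)  # url: the page to scrape (assumption)
soup = BeautifulSoup(r.content, 'html.parser')
for t in soup.find_all("h6", "h6 list-object__heading"):
    print(t.text.strip())  # each headline, whitespace stripped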