Data Science & Machine Learning
Python
String
'the solution is {} for {}'.format(var1,var2)
'the solution is {one} for {two}'.format(one=var1,two=var2)
.lower
.upper
.split
List
.append
.pop(index)
slice
nest
mutable
Dictionary
{key1:value1,key2:value2,....}
.keys
.values
.items
Tuple
(a,b,c,d...)
immutable
unpacking
for a, b in list_of_tuples:
Set
{a,b,c,d}
Unique
Elements
set(list)
.add
Comparison
greater than >
less than <
==
!=
Boolean
Logic
and
or
if:
elif:
else:
for item in seq:
while <condition>:
list(range(start_ind,end_ind,step))
[num**2 for num in seq]
function
def name_function(param1 ="default",...):
return
lambda input_var : output
list(map(function,sequence))
list( filter ( lambda num : num%2 == 0,seq ) )
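A minimal sketch tying lambdas, map and filter together (seq is a hypothetical list):
seq = [1, 2, 3, 4, 5]
squares = [num**2 for num in seq]                     # list comprehension -> [1, 4, 9, 16, 25]
doubled = list(map(lambda num: num * 2, seq))         # map applies the lambda to every item
evens = list(filter(lambda num: num % 2 == 0, seq))   # filter keeps items where the lambda is True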
Numpy
conda install numpy
pip install numpy
import numpy as np
Array
.array(list1D)
.arange(start,stop,step)
.linspace(start,end,nb_points)
slice
Matrix
.array(list2D)
.eye(dimension)
.zeros((nb_items1,nb_items2))
.ones((nb_items1,nb_items2))
Boolean comparison
arr [ arr > 5 ]
Random
.random.rand(dim1,dim2)
0 to 1
Uniform
.random.randn(dim1,dim2)
Mean 0, std 1
Standard normal
.random.randint(low,high,size)
Attributes
.reshape(new_dim1,new_dim2)
.shape
Min/Max
.max()
.min()
value
.argmax()
.argmin()
index
.dtype
Broadcast
arr[0:5] = 99
arr.copy()
Operations
+*/-
Element-wise
np.sqrt(arr)
np.exp(arr)
np.sin(arr)
np.log(arr)
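A minimal sketch of the NumPy notes above (array names are hypothetical):
import numpy as np
arr = np.arange(0, 25).reshape(5, 5)   # 5x5 matrix holding 0..24
matches = arr[arr > 5]                 # boolean comparison returns a flat array of matches
arr[0:2] = 99                          # broadcast: assign 99 to the first two rows
roots = np.sqrt(arr)                   # element-wise operation on the whole array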
Pandas
pip install pandas
conda install pandas
import pandas as pd
pd.Series(data = my_data, index = labels)
pd.Series(<dictionary_name>)
DataFrames
pd.DataFrame(data, index, columns)
df[column_value]
Series
Sharing the same index
df.drop(column_name,axis=1,inplace=True)
df.shape
df.info()
df.describe()
df.loc[index_name]
df.iloc[index_location]
df.set_index ( column_name )
df.reset_index()
Comparison
df [ df > 0]
df [ df [ column_name] > 0 ]
df [ condition1 & condition2 ]
MultiIndex
pd.MultiIndex.from_tuples
df.index.names
df.xs( 1 , level = 'num' )
Missing
np.nan
df.dropna(axis=1)
df.dropna(thresh = 2)
df.fillna ( value = <name_value> )
df.fillna ( value = df [ column_name ].mean() )
GroupBy
df.groupby ( column_name )
.describe()
.describe().transpose()
Merge & Join
pd.concat( [df1 , df2 , df3] , axis = 0)
pd.merge( df1, df2 , how = "inner" , on = 'key' )
left.join(right)
Same index
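A small sketch of the three combination styles, using hypothetical toy frames:
import pandas as pd
df1 = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]})
df2 = pd.DataFrame({'key': ['a', 'b'], 'y': [3, 4]})
pd.concat([df1, df2], axis=0)                        # stack rows
pd.merge(df1, df2, how='inner', on='key')            # SQL-style join on a column
df1.set_index('key').join(df2.set_index('key'))      # join on a shared index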
Operations
Unique
df [ 'col2' ] . unique()
List
nunique()
df [ 'col2' ] . nunique()
Count
df [ 'col2' ] . value_counts()
Occurrences
List
df [ 'col1' ] . apply ( function_name )
df . drop ( 'col1' , axis = 1 , inplace = True)
df.columns
df.index
df.isnull()
df . sort_values (by = 'col2' )
df.pivot_table( values = 'D' , index = ['A','B'] , columns = ['C'] )
Data Input & Output
pd.read_csv ( 'file_name.csv' )
df.to_csv ( 'my_output' , index = False)
pd.read_excel('Excel_Sample.xlsx',sheet_name='Sheet1')
pd.read_html('url_link.html')
Functions
.pct_change()
Variation vs previous row (t / t-1)
.idxmin() / .idxmax()
Index of the first occurrence of the min / max
.loc[index1:index2] (.ix is deprecated)
.rolling(window=30).mean()
.to_datetime(<date to convert>)
df['year'] = df['date'].apply(lambda date : date.year)
.drop('ID',axis=1)
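A minimal time-series sketch of the functions above (column names are hypothetical):
import pandas as pd
df = pd.DataFrame({'date': ['2020-01-01', '2020-01-02', '2020-01-03'],
                   'price': [10.0, 11.0, 9.9]})
df['date'] = pd.to_datetime(df['date'])               # parse strings into datetimes
df['year'] = df['date'].apply(lambda date: date.year)
df['returns'] = df['price'].pct_change()              # variation vs previous row
df['price'].rolling(window=2).mean()                  # rolling average (window=30 in the notes)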
Matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(x , y , <color> )
plt.xlabel(<title>)
plt.title(<title>)
plt.show()
plt.subplot(nrows, ncols, index)
fig = plt.figure()
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
axes.plot(x, y, 'b')
axes.set_xlabel('Set X Label')
fig,axes = plt.subplots(1,2)
axes[0]
figsize=(1,2)
dpi=100
fig.savefig('my_name.png')
ax.plot(x,y,label='label1')
ax.legend(loc=0)
plt.tight_layout()
color = 'blue'
linewidth = 5
alpha = 0.5
linestyle='--'
marker='o'
markersize=3
markerfacecolor = 'yellow'
markeredgewidth = 3
markeredgecolor = "blue"
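A sketch pulling the object-oriented API and the style options above together:
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0, 5, 11)
fig, axes = plt.subplots(1, 2, figsize=(8, 3), dpi=100)
axes[0].plot(x, x**2, color='blue', linewidth=2, linestyle='--',
             marker='o', markersize=5, label='label1')
axes[0].set_xlabel('Set X Label')
axes[0].legend(loc=0)
plt.tight_layout()
fig.savefig('my_name.png')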
Seaborn
http://seaborn.pydata.org/
import seaborn as sns
%matplotlib inline
tips=sns.load_dataset(<name>)
Distribution
sns.jointplot(x=<field1>,y=<field2>,data=tips,kind='reg')
sns.pairplot(tips,hue=<field1>)
All combinations
sns.distplot(tips['field'],kde=False,bins=20)
sns.rugplot(tips['field'])
Categorical
sns.barplot(x=<field1>,y=<field2>,data=tips)
sns.countplot(x=<field>,data=tips)
sns.boxplot(x=<field1>,y=<field2>,data=tips,hue=<field3>)
Box & whiskers
sns.violinplot(x=<field1>,y=<field2>,data=tips,split=True)
sns.stripplot(x="day", y="total_bill", data=tips,jitter=True)
Scatter with jitter
sns.swarmplot(x="day", y="total_bill", data=tips)
Violin + Strip
sns.factorplot(x='sex',y='total_bill',data=tips,kind='bar')
General
Matrix
sns.heatmap(tc,annot=True)
flights.pivot_table(values='passengers',index='month',columns='year')
sns.clustermap(fp)
Grids
sns.PairGrid(iris)
g.map_diag(plt.hist)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)
sns.FacetGrid(tips, col="time", row="smoker")
g.map(plt.hist, "total_bill")
g.map(plt.scatter, "total_bill", "tip").add_legend()
Regression
sns.lmplot(x='total_bill',y='tip',data=tips,col='sex')
Style
sns.set_style('ticks')
sns.despine(left=True)
plt.figure(figsize=(12,3))
sns.set_context('poster',font_scale=4)
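A minimal sketch combining a categorical plot with the style settings above:
import seaborn as sns
import matplotlib.pyplot as plt
tips = sns.load_dataset('tips')
sns.set_style('ticks')
plt.figure(figsize=(12, 3))
sns.boxplot(x='day', y='total_bill', hue='smoker', data=tips)
sns.despine(left=True)
plt.show()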
Other Viz
Pandas
df1['A'].plot()
.area(alpha=0.4)
.bar()
.bar(stacked=True)
.line(x=<field1>,y=<field2>)
.scatter(x=<field1>,y=<field2>,c=<field3>)
.box()
.hexbin(x=<field1>,y=<field2>,C=<field3>,cmap='coolwarm')
.hist()
plt.style.use('ggplot')
df3.iloc[0:30].plot.area(alpha=0.4)
Plotly & Cufflinks
pip install plotly
pip install cufflinks
df.iplot(kind='scatter',x='A',y='B',mode='markers',size=10)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()
https://images.plot.ly/plotly-documentation/images/python_cheat_sheet.pdf
https://plotly.com/python/reference/#choropleth
Geographical
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
data = dict( ....)
layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout = layout)
iplot(choromap)
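A minimal choropleth sketch following the pattern above (locations and z values are hypothetical):
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
data = dict(type='choropleth', locations=['AZ', 'CA', 'NY'],
            locationmode='USA-states', z=[1.0, 2.0, 3.0],
            colorbar={'title': 'Scale'})
layout = dict(geo={'scope': 'usa'})
choromap = go.Figure(data=[data], layout=layout)
iplot(choromap)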
Learning
Supervised
Set
Training
Validation
Test
Performance
Classification
Accuracy
Correct / Total
Misleading on unbalanced classes :warning:
Recall
True Positive / (True Positive + False Negative)
Precision
True Positive / (True Positive + False Positive)
F1 Score
2 * (Precision * Recall) / (Precision + Recall)
Regression
Error
Mean Absolute Error (MAE)
Mean Squared Error (MSE)
Root Mean Squared Error (RMSE)
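A quick sketch computing the three errors with sklearn (y_test and predictions are hypothetical):
import numpy as np
from sklearn import metrics
y_test = [3.0, 2.5, 4.0]
predictions = [2.8, 2.7, 4.2]
metrics.mean_absolute_error(y_test, predictions)           # MAE
metrics.mean_squared_error(y_test, predictions)            # MSE
np.sqrt(metrics.mean_squared_error(y_test, predictions))   # RMSE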
Python
Scikit
Linear
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
lm = LinearRegression()
lm.fit(X_train, y_train)
predictions = lm.predict(X_test)
metrics.mean_absolute_error(y_test, predictions)
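To inspect the fit (a sketch, assuming X is a DataFrame of features):
print(lm.intercept_)                                            # fitted intercept
coeff_df = pd.DataFrame(lm.coef_, X.columns, columns=['Coefficient'])
print(coeff_df)                                                 # one coefficient per feature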
Logistic Regression
Theory
Sigmoid
1 / (1 + e^(-Z))
0 to 1
sns.heatmap( train.isnull() , yticklabels = False , cbar = False)
Visualize
Null
sns.countplot ( x = 'Survived' , hue = 'Sex', data = train)
Handle nulls
train['Age'] = train [ ['Age','Pclass'] ] . apply ( impute_age, axis = 1)
train . drop ( 'Cabin' , axis = 1 , inplace = True)
train.dropna ( inplace = True )
Dummy
pd.get_dummies ( train['Sex'] , drop_first = True )
train = pd.concat( [train,sex,embark] , axis = 1)
train_test_split
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
predictions = logmodel.predict(X_test)
from sklearn.metrics import classification_report
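Scoring the classifier (a sketch, reusing predictions from above):
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, predictions))        # counts of TP / FP / FN / TN
print(classification_report(y_test, predictions))   # precision, recall, f1-score per class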
KNN
Nearest
Neighbours
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)
scaled_features = scaler.transform(df)
df_feat = pd.DataFrame (scaled_features , columns = df.columns[:-1])
train_test_split
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
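A common companion sketch: scan several k values and keep the error rate for each, then pick n_neighbors at the elbow (assumes the train/test split above):
import numpy as np
error_rate = []
for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred_k = knn.predict(X_test)
    error_rate.append(np.mean(pred_k != y_test))   # fraction misclassified at this k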
Decision Trees
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
predictions = dtree.predict(X_test)
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train,y_train)
rfc_pred = rfc.predict(X_test)
Graphviz
Support Vector Machine
Hyperplane
Separation
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
from sklearn.model_selection import GridSearchCV
param_grid = { 'C' : [0.1, 1, 10, 100, 1000] , 'gamma' : [1, 0.1, 0.01, 0.001] }
grid = GridSearchCV(SVC(), param_grid, verbose = 3)
grid.fit(X_train,y_train)
grid.best_params_
grid.best_estimator_
K-Means Clustering
Unsupervised
Segmentation
Elbow method
from sklearn.datasets import make_blobs
data = make_blobs(n_samples=200,n_features=2, centers = 4, cluster_std = 1.8, random_state=101)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
kmeans.labels_
kmeans.cluster_centers_
Principal Component Analysis
Unsupervised
Variance
Components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)
pca.components_
As a DataFrame
components = pd.DataFrame(pca.components_, columns=<list_name>)
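A minimal end-to-end sketch (assuming df is a hypothetical numeric DataFrame):
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
scaled_data = StandardScaler().fit_transform(df)
pca = PCA(n_components=2)
x_pca = pca.fit_transform(scaled_data)                          # rows projected onto 2 components
components = pd.DataFrame(pca.components_, columns=df.columns)  # loadings per original feature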
Recommender
Type
Content-based
Product
Attributes
Collaborative Filtering (CF)
User
Knowledge
Sub-type
Memory
Model
df = pd.merge(df, movie_titles, on='item_id')
moviemat = df.pivot_table(index='user_id', columns='title', values = 'rating')
moviemat.corrwith(starwars_user_ratings)
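A sketch of memory-based CF via correlations ('Star Wars (1977)' is a hypothetical title in the ratings data):
import pandas as pd
moviemat = df.pivot_table(index='user_id', columns='title', values='rating')
starwars_user_ratings = moviemat['Star Wars (1977)']
similar = moviemat.corrwith(starwars_user_ratings)              # correlation with every other movie
corr_starwars = pd.DataFrame(similar, columns=['Correlation']).dropna()
corr_starwars.sort_values('Correlation', ascending=False)       # most similar movies first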
NLP
Bag of Words
Feature Vector
Cosine
Similarity
TF(d,t)
Term Frequency
d = Document
t = Term
pip install nltk
import nltk
nltk.download_shell()
messages = [line.rstrip() for line in open('smsspamcollection/SMSSpamCollection')]
messages = pd.read_csv('smsspamcollection/SMSSpamCollection',sep='\t', names =['label','message'])
messages.hist(column='length',by='label')
import string
string.punctuation
from nltk.corpus import stopwords
stopwords.words('english')
Normalization
Remove
Punctuation
Stopwords
''.join([c for c in mess if c not in string.punctuation])
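A sketch of a text_process helper applying both removals (the name matches the vectorization step below):
import string
from nltk.corpus import stopwords
def text_process(mess):
    nopunc = ''.join([c for c in mess if c not in string.punctuation])   # drop punctuation
    return [word for word in nopunc.split()
            if word.lower() not in stopwords.words('english')]           # drop stopwords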
Vectorization
Term Frequency
Weight count
Normalize
Unit Length
Step-by-Step
from sklearn.feature_extraction.text import CountVectorizer
bow_transformer = CountVectorizer(analyzer=text_process).fit(messages['message'])
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(messages_bow)
messages_tfidf = tfidf_transformer.transform(messages_bow)
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(messages_tfidf,messages['label'])
Pipeline
from sklearn.pipeline import Pipeline
pipeline = Pipeline( [('bow',CountVectorizer(analyzer=text_process)),('tfidf',TfidfTransformer()),('classifier',MultinomialNB())])
pipeline.fit(msg_train,label_train)
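Scoring the pipeline (a sketch, assuming msg_test and label_test come from the train/test split):
from sklearn.metrics import classification_report
predictions = pipeline.predict(msg_test)
print(classification_report(label_test, predictions))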
Deep Learning
Theory
Perceptron
Activation
Step
Sigmoid (0-1)
Hyperbolic Tangent (-1 to +1)
ReLU
max(0, z)
Softmax
Mutually exclusive classes
Cost Function :heavy_dollar_sign:
Quadratic
Gradient Descent
Learning Rate
Adam
BackPropagation
pip install tensorflow
Tools
Tensorflow
Open Source
Google
Keras
High Level Python
Library
On top of
Tensorflow
Theano
Preparation
df.values (convert to NumPy arrays)
train_test_split
Normalize
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
Modelization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
Creation
model = Sequential()
model.add(Dense(4,activation='relu'))
model = Sequential([Dense(4, activation='relu'), Dense(2, .....)])
model.compile(optimizer = 'rmsprop', loss = 'mse')
model.fit(X_train, y_train, batch_size = 128, epochs = 400, validation_data = (X_test, y_test))
model.history.history
Evaluation
model.evaluate(X_test,y_test)
model.predict(X_test)
from tensorflow.keras.models import load_model
model.save('my_model.h5')
load_model('my_model.h5')
Classification
Dropout
Random Turn-off
from tensorflow.keras.layers import Dropout
model.add(Dropout(0.5))
Early Stop
Epochs
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor ='val_loss' , mode = 'min',verbose = 1, patience = 25)
model.fit(.....,callbacks =[early_stop])
output activation
sigmoid
loss = 'binary_crossentropy'
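A minimal binary-classification sketch pulling these pieces together (layer sizes are hypothetical; assumes the splits and early_stop from above):
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
model = Sequential()
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.5))                          # randomly turn off half the units while training
model.add(Dense(1, activation='sigmoid'))        # sigmoid output for a binary label
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X_train, y_train, epochs=600,
          validation_data=(X_test, y_test), callbacks=[early_stop])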
Big Data
Hadoop
HDFS
128 MB blocks
Datasets
MapReduce
Split
Tasks
Compute
On
HDFS
Spark
Apache
MapReduce Like
Cassandra + HDFS + S3 + ....
Data
In Memory
Resilient Distributed Dataset (RDD)
Transformation
Filter
Map
flatMap
Actions
Collect
Count
First
Take
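A minimal RDD sketch of these transformations and actions (assumes a local pyspark install; 'example.txt' is a hypothetical file):
from pyspark import SparkContext
sc = SparkContext('local', 'example')
rdd = sc.textFile('example.txt')                     # one element per line
words = rdd.flatMap(lambda line: line.split())       # transformation: flatten lines into words
errors = rdd.filter(lambda line: 'error' in line)    # transformation: keep matching lines
errors.count()                                       # action: triggers the computation
errors.take(2)                                       # action: first two matching lines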