Descriptive Statistics
Descriptive Stats with Data and Distributions
Random Variables
A random variable is a variable that takes on numerical values as a result of a random experiment or measurement; it associates a numerical value with each possible outcome. Random variables (R.V.s) must have numerical values.
-
Discrete Random Variable: has a finite number of values or an infinite sequence of values (0, 1, 2, 3, ...) AND the differences between outcomes are meaningful
- A die throw can result in 1, 2, 3, 4, 5 or 6, and each outcome is meaningfully different.
-
Continuous Random Variable: has a nearly infinite number of outcomes that cannot be easily counted AND the differences between the outcomes are NOT meaningful
- With average income, the difference between \$40,000 and \$40,001 is not meaningful
Discrete Probability Distribution
The probability distribution for a random variable X describes how probabilities are assigned to each outcome of the random variable. Let 0 = Heads and 1 = Tails for a coin flip, so our discrete random variable x is described as: x = 0, 1
- The probability of each outcome is described by a discrete probability function denoted by P(x): $$\sum P(x) = 1$$ The sum of the probabilities P(x) over all outcomes of the random variable must equal 1.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform
#Generating Uniform random numbers
from scipy.stats import uniform
# Draw 1000 random variates (rvs) from a continuous uniform distribution.
# With loc=0 and scale=10 the samples lie in [loc, loc + scale] = [0, 10].
data_uniform = uniform.rvs(size=1000, loc=0, scale=10)
len(data_uniform)
data_uniform[:20]
# distplot normalises the histogram to a density whenever a KDE is drawn
# (kde=True), so the y axis shows density, not raw counts; it is labelled
# accordingly below.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# migrating to histplot/displot once the minimum seaborn version is confirmed.
ax = sns.distplot(data_uniform,
                  bins=100,
                  kde=True,
                  color='skyblue',
                  hist_kws={"linewidth": 15, 'alpha': 1})
ax.set(xlabel='Uniform ', ylabel='Density')
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
%matplotlib inline
def density_plot(ds):
    """Draw a box plot above a histogram/KDE of ``ds`` with mean, median and mode marked.

    The two panels share the x axis. The mean is drawn as a dashed red line,
    the median as a solid green line and the mode as a solid blue line; the
    legend on the histogram panel names each of the three lines.

    Parameters
    ----------
    ds : array-like
        Numeric observations. Any shape is accepted; the values are flattened
        to one dimension before the statistics are computed and plotted.

    Raises
    ------
    ValueError
        If ``ds`` contains no observations.
    """
    values = np.asarray(ds, dtype=float).ravel()
    if values.size == 0:
        raise ValueError("density_plot requires at least one observation")

    mean = values.mean()
    median = np.median(values)
    # np.unique returns the distinct values sorted ascending, and argmax picks
    # the first maximum, so ties resolve to the smallest most-frequent value.
    # The result is a scalar, which is what axvline expects.
    distinct, counts = np.unique(values, return_counts=True)
    mode = distinct[counts.argmax()]

    _, (ax_box, ax_hist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (0.2, 1)}
    )

    # Pass the data as ``x=`` so the box is drawn horizontally and lines up
    # with the shared x axis regardless of the seaborn version.
    sns.boxplot(x=values, ax=ax_box)
    sns.distplot(values, hist=True, kde=True, ax=ax_hist,
                 color='darkblue',
                 hist_kws={'edgecolor': 'black'},
                 kde_kws={'linewidth': 4})

    markers = (
        ('Mean', mean, 'r', '--'),
        ('Median', median, 'g', '-'),
        ('Mode', mode, 'b', '-'),
    )
    for label, position, color, linestyle in markers:
        ax_box.axvline(position, color=color, linestyle=linestyle)
        # Label the line itself so the legend entry is tied to the right
        # artist instead of to whichever artist was drawn first.
        ax_hist.axvline(position, color=color, linestyle=linestyle, label=label)

    ax_hist.legend()
    ax_box.set(xlabel='')
    plt.show()


# density_plot is defined above its first use so the code runs top to bottom.
dset = pd.DataFrame([[1],[2],[2],[3],[3],[3],[4],[4],[5]])
print("Mean: ",dset.mean()[0])
# DataFrame.std() defaults to the sample standard deviation (ddof=1).
print("Standard deviation", dset.std()[0])
density_plot(np.array(dset))
# Applying Standardization.
std_sc = StandardScaler()
zscore_data = std_sc.fit_transform(dset)
print("Mean: ",zscore_data.mean())
# ndarray.std() defaults to the population standard deviation (ddof=0).
print("Standard deviation", zscore_data.std())
# Draw the box plot / histogram for the standardized values.
density_plot(zscore_data)
# Container type returned by fit_transform
# (NOTE(review): presumably a NumPy ndarray — confirm).
type(zscore_data)
# Median of the standardized values.
np.median(zscore_data)
# Mode of the original (unscaled) data; index 0 selects the mode value(s)
# from the (mode, count) result.
stats.mode(dset)[0]
# Scratch example: a list, a dict, and iteration over the dict's keys.
l = [1, 2, 4]
dic = {1: "a", 2: "b", 3: "4"}
# View of the dict's (key, value) pairs.
dic.items()
# Only the keys are printed, so iterate the dict directly instead of
# unpacking (key, value) pairs whose value is never used.
for key in dic:
    print(key)
# A 3x3 integer matrix and a 3x1 column vector, used to contrast NumPy's
# element-wise product with the matrix product.
lst = np.array([[3, 2, 4], [6, 7, 8], [1, 4, 1]])
two = np.array([1, 4, 5]).reshape(3, 1)
# Element-wise product: the column vector is broadcast across the columns,
# so row i of lst is scaled by two[i]; the result is 3x3.
np.multiply(lst, two)
# Matrix product: (3x3) @ (3x1) gives a 3x1 column vector.
np.matmul(lst, two)