# Road Accident Analysis
# Exploratory data analysis to gain insights from past road accident data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import sklearn
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load the input tables from CSV files in the working directory.
# The LSOA column of the accident table is forced to text.
acci = pd.read_csv('acc.csv', dtype={'LSOA_of_Accident_Location': str})
casu, vehi, kclt = (
    pd.read_csv(csv_path)
    for csv_path in ('casuality.csv', 'vehicle.csv', 'KCLT1.CSV')
)

# Notebook-style previews: these expressions only display something when run
# interactively; as a plain script they compute and discard their results.
for frame in (acci, casu):
    frame.describe()
    frame.head()
vehi.describe()
# What fraction of accidents occur in urban, rural and other (n/a) areas?
# Share of accidents per area type.
# Area codes follow the naming used by this script: 1 -> urban, 2 -> rural,
# 3 -> other / not allocated (NOTE(review): confirm against the data guide).
# Rows with any other code are ignored, as before.

# Count every area code in a single pass instead of filtering the frame
# three times; codes that never occur are reported as 0.
area_counts = (
    acci['Urban_or_Rural_Area']
    .value_counts()
    .reindex([1, 2, 3], fill_value=0)
)
urban_acci, rural_acci, na_acci = (int(count) for count in area_counts)
total_acci = urban_acci + rural_acci + na_acci

# Guard against an empty / fully filtered table: report 0% rather than
# raising ZeroDivisionError.
urban_pct = urban_acci / total_acci * 100 if total_acci else 0.0
rural_pct = rural_acci / total_acci * 100 if total_acci else 0.0
na_pct = na_acci / total_acci * 100 if total_acci else 0.0

print("Percentage of accidents occur in urban areas is {0:.0f}%".format(urban_pct))
print("Percentage of accidents occur in rural areas is {0:.0f}%".format(rural_pct))
print("Percentage of accidents occur in other areas is {0:.0f}%".format(na_pct))

# Label the bars with the area type rather than the raw numeric code so the
# chart can be read without the code book.
x = ['Urban', 'Rural', 'Other']
y = [urban_pct, rural_pct, na_pct]
x_pos = list(range(len(x)))
plt.bar(x_pos, y)
plt.ylabel('Percentage of accidents')
plt.xticks(x_pos, x)
plt.title("Percentage of accidents occured by area")
plt.show()
# When is the most dangerous time to drive?
# Accidents and fatal accidents per hour of day.
# Adds a numeric 'Hour' column to `acci` (NaN where the time is missing or
# cannot be parsed), then plots three panels: fatal accidents, all accidents,
# and the percentage of accidents that were fatal, each against the hour.

# The hour is the text before the first ':' of the 'Time' value.
acci['Hour'] = pd.to_numeric(
    acci['Time'].map(lambda value: str(value).split(':')[0]),
    errors='coerce',
)

hour = list(range(24))

# One value_counts pass per series replaces 48 full-frame filters.
# Hours with no rows are filled with 0 so every list has 24 entries.
# Severity code 1 is treated as "fatal" (NOTE(review): confirm coding).
total_by_hour = acci['Hour'].value_counts().reindex(hour, fill_value=0)
fatal_by_hour = (
    acci.loc[acci['Accident_Severity'] == 1, 'Hour']
    .value_counts()
    .reindex(hour, fill_value=0)
)
num_of_acci = total_by_hour.astype(int).tolist()
num_of_fatal_acci = fatal_by_hour.astype(int).tolist()

print(hour)
print(num_of_fatal_acci)
print(num_of_acci)

# Percentage of accidents in each hour that were fatal. Hours without any
# accident are masked to NaN up front, so they neither trigger a 0/0
# warning nor take part in the "most dangerous hour" comparison.
fatal_share = fatal_by_hour / total_by_hour.where(total_by_hour > 0) * 100
normalized_num_of_fatal_acci = fatal_share.tolist()

fig = plt.figure(figsize=(14, 8))

ax1 = fig.add_subplot(221)
ax1.plot(hour, num_of_fatal_acci)
ax1.set_ylabel('Number of fatal accidents')
ax1.set_xlabel('Hour')
ax1.grid(True)

ax2 = fig.add_subplot(222)
ax2.plot(hour, num_of_acci)
ax2.set_ylabel('Number of all accidents')
ax2.set_xlabel('Hour')
ax2.grid(True)

ax3 = fig.add_subplot(223)
ax3.plot(hour, normalized_num_of_fatal_acci)
ax3.set_ylabel('Percentage of fatal accidents in all accidents')
ax3.set_xlabel('Hour')
ax3.grid(True)

plt.show()

# idxmax skips NaN and returns the first hour holding the maximum; only
# report a result when at least one hour has data.
if fatal_share.notna().any():
    print("The most dangerous hour to drive, when most fatal accidents happend in all accidents, is {} o'clock".format(int(fatal_share.idxmax())))
# Number of drivers involved in fatal accidents, split by driver sex.
#
# The vehicle table and the accident table have different row orders and row
# counts, so they must be joined on 'Accident_Index'. Placing them side by
# side with pd.concat(axis=1) pairs rows by position and attaches severities
# to unrelated drivers.
accident_severity = acci[['Accident_Index', 'Accident_Severity']]
driver_sex = vehi[['Accident_Index', 'Sex_of_Driver']]

# Left join keeps one row per vehicle/driver record.
# NOTE(review): assumes 'Accident_Index' is unique in `acci`; duplicated keys
# there would duplicate driver rows.
driver_severity = driver_sex.merge(
    accident_severity, on='Accident_Index', how='left'
)

# Severity code 1 is treated as fatal; sex codes 1 / 2 are counted as m / f
# as in the original variable names (NOTE(review): confirm coding).
in_fatal = driver_severity['Accident_Severity'] == 1
m = int((in_fatal & (driver_severity['Sex_of_Driver'] == 1)).sum())
f = int((in_fatal & (driver_severity['Sex_of_Driver'] == 2)).sum())
print(m, f)
# What is the trend in the number of accidents that occur each year?
# Accidents per calendar year, 2005 through 2014.
# The year is taken from the first four characters of 'Accident_Index' and
# stored as a numeric 'Year' column on `acci` (NaN where it is not a number).
acci['Year'] = acci['Accident_Index'].map(lambda index_value: str(index_value)[:4])
acci['Year'] = acci['Year'].apply(pd.to_numeric, errors='coerce')

year = list(range(2005, 2015))
num_of_acci_year = [len(acci[acci['Year'] == yr]) for yr in year]

plt.plot(year, num_of_acci_year)
plt.title('Correlation between number of accidents and year')
plt.xlabel('Year')
plt.ylabel('Number of accidents')
plt.grid(True)
plt.show()
# Do accidents in high-speed-limit areas have more casualties?
# Average number of casualties per accident for each speed limit.
#
# A single groupby replaces one full-frame scan per speed limit. groupby
# sorts the keys and drops rows whose speed limit is missing, so a NaN limit
# can no longer break the sort order or produce a 0/0 ratio.
by_limit = (
    acci.groupby('Speed_limit')['Number_of_Casualties']
    .agg(['sum', 'size'])  # 'size' = number of accident rows in the group
)

speed_limit = by_limit.index.tolist()
num_casualty = by_limit['sum'].tolist()
num_acci = by_limit['size'].tolist()
# Every group holds at least one row, so the division is always defined.
ratio = (by_limit['sum'] / by_limit['size']).tolist()

plt.plot(speed_limit, ratio)
plt.xlabel('Speed limit')
plt.ylabel('Casualty per accident, average')
plt.title('Correlation between casualty per accident and speed limit')
plt.grid(True)
plt.show()
# How fast does the number of car accidents drop off with age? (Only consider car drivers who are legally allowed to drive in the UK: 17 years or older)
# Number of car-driver records per driver age, from 17 up to the oldest age
# present in the vehicle table.
age_acci = vehi[['Accident_Index', 'Age_of_Driver', 'Vehicle_Type']]

# Select car rows once instead of re-filtering the frame for every age.
# Vehicle type 9 is treated as "car", matching the plot labels below
# (NOTE(review): confirm coding).
car_driver_ages = age_acci.loc[age_acci['Vehicle_Type'] == 9, 'Age_of_Driver']

# Series.max() ignores missing ages; the builtin max() is order-dependent
# when NaN is present and returns a float that range() rejects.
oldest_age = age_acci['Age_of_Driver'].max()
age = list(range(17, int(oldest_age) + 1)) if pd.notna(oldest_age) else []

# Ages with no car-driver rows are reported as 0.
num_of_acci = (
    car_driver_ages.value_counts()
    .reindex(age, fill_value=0)
    .astype(int)
    .tolist()
)

plt.plot(age, num_of_acci, label='Data', marker='o')
plt.xlabel('Age')
plt.ylabel('Number of car accidents')
plt.title('Correlation between driver age and number of car accidents')
plt.grid(True)
plt.show()
# Number of accidents recorded under each light-condition code.
#
# The earlier version plotted the 'Accident_Index' strings of rows 1-24
# against their light codes, which does not count anything. Count the rows
# per code over the whole table and show them as bars instead.
light_counts = acci['Light_Conditions'].value_counts().sort_index()
LightConditions = light_counts.index.tolist()
accidents = light_counts.tolist()

fig = plt.figure(figsize=(10, 8))
bar_positions = list(range(len(LightConditions)))
plt.bar(bar_positions, accidents)
# Codes are categories, so place them as tick labels rather than on a
# numeric axis.
plt.xticks(bar_positions, LightConditions)
plt.xlabel('Light_Conditions')
plt.ylabel('Accidents')
plt.title('no of accidents occurred as per light conditions')
plt.grid(True)
plt.show()