import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import sklearn

import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Read the four CSV inputs. The LSOA_of_Accident_Location column of the
# accident table is read as str; every other column uses pandas' inferred
# dtype.
# NOTE(review): acci / casu / vehi look like the accident, casualty and
# vehicle tables; the role of kclt is not visible in this part of the file.
acci = pd.read_csv('acc.csv', dtype={'LSOA_of_Accident_Location': str})
casu = pd.read_csv('casuality.csv')
vehi = pd.read_csv('vehicle.csv')
kclt = pd.read_csv('KCLT1.CSV')

# Notebook-style previews: these bare expressions only render a table when
# evaluated interactively; they do not modify the frames.
acci.head()
acci.describe()
acci.head()
casu.describe()
casu.head()
vehi.describe()

What fraction of accidents occur in urban, rural and other (na) areas?

# Share of accidents per Urban_or_Rural_Area code. The script reports code 1
# as "urban", 2 as "rural" and 3 as "other". Rows carrying any other value
# (or a missing value) are not part of the total.
area_code = acci['Urban_or_Rural_Area']
urban_acci, rural_acci, na_acci = (
    int((area_code == code).sum()) for code in (1, 2, 3)
)
total_acci = urban_acci + rural_acci + na_acci

# Percentages relative to the three counted categories only.
urban_pct, rural_pct, na_pct = (
    count / total_acci * 100 for count in (urban_acci, rural_acci, na_acci)
)

for template, pct in (
    ("Percentage of accidents occur in urban areas is {0:.0f}%", urban_pct),
    ("Percentage of accidents occur in rural areas is {0:.0f}%", rural_pct),
    ("Percentage of accidents occur in other areas is {0:.0f}%", na_pct),
):
    print(template.format(pct))

# Bar chart of the three percentages; tick labels are the raw area codes.
x = ['1', '2', '3']
y = [urban_pct, rural_pct, na_pct]
x_pos = [position for position, _ in enumerate(x)]
plt.bar(x_pos, y)
plt.xticks(x_pos, x)
plt.ylabel('Percentage of accidents')
plt.title("Percentage of accidents occured by area")
plt.show()

Percentage of accidents occur in urban areas is 81%
Percentage of accidents occur in rural areas is 19%
Percentage of accidents occur in other areas is 0%

When is the most dangerous time to drive?

# Derive the hour of day from the text before the first ':' in Time.
# Anything that does not parse as a number (including missing times)
# becomes NaN and therefore matches none of the hours counted below.
acci['Hour'] = pd.to_numeric(
    acci['Time'].astype(str).str.split(':').str[0], errors='coerce'
)

# Per-hour tallies: all accidents, and those with Accident_Severity == 1
# (the severity this script labels "fatal").
hour = list(range(24))
hour_of_day = acci['Hour']
is_fatal = acci['Accident_Severity'] == 1
num_of_acci = [int((hour_of_day == h).sum()) for h in hour]
num_of_fatal_acci = [int((is_fatal & (hour_of_day == h)).sum()) for h in hour]
print(hour)
print(num_of_fatal_acci)
print(num_of_acci)

# Fatal accidents as a percentage of all accidents in the same hour.
fatal_counts = np.array(num_of_fatal_acci)
all_counts = np.array(num_of_acci)
normalized_num_of_fatal_acci = list(fatal_counts / all_counts * 100)

# Three panels on a 2x2 grid: fatal count, total count, fatal percentage.
fig = plt.figure(figsize=(14,8))
panels = (
    (221, num_of_fatal_acci, 'Number of fatal accidents'),
    (222, num_of_acci, 'Number of all accidents'),
    (223, normalized_num_of_fatal_acci, 'Percentage of fatal accidents in all accidents'),
)
axes = []
for position, series, y_label in panels:
    axis = fig.add_subplot(position)
    axis.plot(hour, series)
    axis.set_ylabel(y_label)
    axis.set_xlabel('Hour')
    axis.grid(True)
    axes.append(axis)
ax1, ax2, ax3 = axes

plt.show()

# The list position equals the hour, so the index of the maximum is the hour
# with the highest fatal percentage.
worst_hour = normalized_num_of_fatal_acci.index(max(normalized_num_of_fatal_acci))
print("The most dangerous hour to drive, when most fatal accidents happend in all accidents, is {} o'clock".format(worst_hour))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
[43, 39, 36, 16, 14, 15, 23, 31, 35, 31, 42, 34, 51, 44, 56, 56, 56, 64, 62, 61, 50, 53, 54, 41]
[1460, 1082, 933, 559, 399, 536, 1222, 3121, 5663, 4144, 3827, 4443, 5351, 5433, 5369, 6882, 7090, 7645, 6085, 4845, 3814, 3029, 2500, 2282]

The most dangerous hour to drive, when most fatal accidents happend in all accidents, is 2 o'clock

# Count drivers involved in severity-1 accidents, split by Sex_of_Driver.
#
# The vehicle table can hold several rows per accident, so the two tables
# must be joined on Accident_Index. Concatenating them side by side pairs
# rows purely by position and attaches each driver to an unrelated accident.
temp = acci[['Accident_Index', 'Accident_Severity']]
temp1 = vehi[['Accident_Index', 'Sex_of_Driver']]

# Left join keeps every vehicle row; vehicles without a matching accident get
# a NaN severity and are excluded by the == 1 comparison below.
x = temp1.merge(temp, on='Accident_Index', how='left')

in_fatal_accident = x['Accident_Severity'] == 1
# NOTE(review): codes 1 and 2 are presumably male and female (per the m / f
# variable names) -- confirm against the dataset's code book.
m = int((in_fatal_accident & (x['Sex_of_Driver'] == 1)).sum())
f = int((in_fatal_accident & (x['Sex_of_Driver'] == 2)).sum())
print(m, f)

680 276

What is the trend in the number of accidents that occur each year?

# The year is taken from the first four characters of Accident_Index;
# values that do not parse as a number become NaN.
acci['Year'] = pd.to_numeric(
    acci['Accident_Index'].astype(str).str[:4], errors='coerce'
)

# Number of accident rows for each year from 2005 through 2014.
year = list(range(2005, 2015))
accident_year = acci['Year']
num_of_acci_year = [int((accident_year == y).sum()) for y in year]

plt.plot(year, num_of_acci_year)
plt.title('Correlation between number of accidents and year')
plt.xlabel('Year')
plt.ylabel('Number of accidents')
plt.grid(True)
plt.show()

Do accidents in high-speed-limit areas have more casualties?

# Average number of casualties per accident for each speed limit.
#
# groupby drops rows whose Speed_limit is missing and returns the groups in
# ascending key order. Iterating over set(acci['Speed_limit']) instead lets a
# NaN key through: it matches no rows, so the ratio becomes 0 / 0 (NaN plus a
# RuntimeWarning), and NaN also makes the sorted() order unreliable.
per_limit = acci.groupby('Speed_limit')['Number_of_Casualties']
casualties_by_limit = per_limit.sum()
accidents_by_limit = per_limit.size()  # every group has at least one row

speed_limit = casualties_by_limit.index.tolist()
num_casualty = casualties_by_limit.tolist()
num_acci = accidents_by_limit.tolist()
ratio = (casualties_by_limit / accidents_by_limit).tolist()

plt.plot(speed_limit, ratio)
plt.xlabel('Speed limit')
plt.ylabel('Casualty per accident, average')
plt.title('Correlation between casualty per accident and speed limit')
plt.grid(True)
plt.show()

<ipython-input-19-248ccd0298b6>:13: RuntimeWarning: invalid value encountered in longlong_scalars
  r = casualty / accident

How fast does the number of car accidents drop off with age? (Only consider car drivers who are legally allowed to drive in the UK: 17 years or older)

# Number of vehicle rows with Vehicle_Type == 9 for each driver age from 17
# up to the largest Age_of_Driver present in the vehicle table.
# NOTE(review): Vehicle_Type 9 is presumably "car" (per the plot labels) --
# confirm against the dataset's code book.
age_acci = vehi[['Accident_Index', 'Age_of_Driver', 'Vehicle_Type']]
driver_age = age_acci['Age_of_Driver']
is_car = age_acci['Vehicle_Type'] == 9

age = list(range(17, max(driver_age) + 1))
# Rebinds num_of_acci, which the hourly analysis earlier in the file also
# assigns.
num_of_acci = [int((is_car & (driver_age == years)).sum()) for years in age]

plt.plot(age, num_of_acci, label = 'Data', marker = 'o')
plt.title('Correlation between driver age and number of car accidents')
plt.xlabel('Age')
plt.ylabel('Number of car accidents')
plt.grid(True)
plt.show()

# Number of accidents recorded under each Light_Conditions code.
#
# Count the rows per code. Plotting Accident_Index against Light_Conditions
# for a 24-row slice only draws individual accident IDs on the y-axis, not
# the per-condition totals that the title and axis labels describe.
# value_counts ignores rows with a missing light condition; sort_index puts
# the codes in ascending order.
light_counts = acci['Light_Conditions'].value_counts().sort_index()
LightConditions = light_counts.index.tolist()
accidents = light_counts.tolist()

fig = plt.figure(figsize=(10,8))
# Bars sit at evenly spaced positions and are labelled with the raw codes,
# because the codes are categories rather than a numeric scale.
bar_positions = list(range(len(LightConditions)))
plt.bar(bar_positions, accidents)
plt.xticks(bar_positions, LightConditions)
plt.xlabel('Light_Conditions')
plt.ylabel('Accidents')
plt.title('no of accidents occurred as per light conditions')
plt.grid(True)
plt.show()

	Accident_Index	Police_Force	Accident_Severity	Number_of_Vehicles	Number_of_Casualties	Date	Day_of_Week	Time	Local_Authority_(District)	Local_Authority_(Highway)	...	Pedestrian_Crossing-Physical_Facilities	Light_Conditions	Weather_Conditions	Road_Surface_Conditions	Urban_or_Rural_Area	Did_Police_Officer_Attend_Scene_of_Accident	LSOA_of_Accident_Location
0	200501BS00001	1	2	1	1	4/1/2005	3.0	17:42	12.0	E09000020	...	1.0	1.0	2.0	2.0	1.0	1.0	E01002849
1	200501BS00002	1	3	1	1	5/1/2005	4.0	17:36	12.0	E09000020	...	5.0	4.0	1.0	1.0	1.0	1.0	E01002909
2	200501BS00003	1	3	2	1	6/1/2005	5.0	0:15	12.0	E09000020	...	0.0	4.0	1.0	1.0	1.0	1.0	E01002857
3	200501BS00004	1	3	1	1	7/1/2005	6.0	10:35	12.0	E09000020	...	0.0	1.0	1.0	1.0	1.0	1.0	E01002840
4	200501BS00005	1	3	1	1	10/1/2005	2.0	21:13	12.0	E09000020	...	0.0	7.0	1.0	2.0	1.0	1.0	E01002863

	Accident_Index	Police_Force	Accident_Severity	Number_of_Vehicles	Number_of_Casualties	Date	Day_of_Week	Time	Local_Authority_(District)	Local_Authority_(Highway)	...	Pedestrian_Crossing-Physical_Facilities	Light_Conditions	Weather_Conditions	Road_Surface_Conditions	Urban_or_Rural_Area	Did_Police_Officer_Attend_Scene_of_Accident	LSOA_of_Accident_Location
0	200501BS00001	1	2	1	1	4/1/2005	3.0	17:42	12.0	E09000020	...	1.0	1.0	2.0	2.0	1.0	1.0	E01002849
1	200501BS00002	1	3	1	1	5/1/2005	4.0	17:36	12.0	E09000020	...	5.0	4.0	1.0	1.0	1.0	1.0	E01002909
2	200501BS00003	1	3	2	1	6/1/2005	5.0	0:15	12.0	E09000020	...	0.0	4.0	1.0	1.0	1.0	1.0	E01002857
3	200501BS00004	1	3	1	1	7/1/2005	6.0	10:35	12.0	E09000020	...	0.0	1.0	1.0	1.0	1.0	1.0	E01002840
4	200501BS00005	1	3	1	1	10/1/2005	2.0	21:13	12.0	E09000020	...	0.0	7.0	1.0	2.0	1.0	1.0	E01002863

	Vehicle_Reference	Casualty_Reference	Casualty_Class	Sex_of_Casualty	Age_of_Casualty	Age_Band_of_Casualty	Casualty_Severity	Pedestrian_Location	Pedestrian_Movement	Car_Passenger	Bus_or_Coach_Passenger	Pedestrian_Road_Maintenance_Worker	Casualty_Type	Casualty_Home_Area_Type
count	713000.000000	712999.000000	712999.000000	712999.000000	712999.000000	712999.000000	712999.000000	712999.000000	712999.000000	712999.000000	712999.000000	712999.0	712999.000000	712999.000000
mean	1.478285	1.437173	1.496647	1.415758	33.241136	5.871056	2.867031	0.654889	0.456155	0.293881	0.090340	-1.0	7.635827	0.889673
std	0.619397	1.101567	0.701956	0.495390	18.632343	2.431920	0.372657	1.945577	1.620012	0.600751	0.565519	0.0	6.679438	1.084864
min	1.000000	1.000000	1.000000	-1.000000	-1.000000	-1.000000	1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.0	0.000000	-1.000000
25%	1.000000	1.000000	1.000000	1.000000	19.000000	4.000000	3.000000	0.000000	0.000000	0.000000	0.000000	-1.0	5.000000	1.000000
50%	1.000000	1.000000	1.000000	1.000000	30.000000	6.000000	3.000000	0.000000	0.000000	0.000000	0.000000	-1.0	9.000000	1.000000
75%	2.000000	2.000000	2.000000	2.000000	45.000000	7.000000	3.000000	0.000000	0.000000	0.000000	0.000000	-1.0	9.000000	1.000000
max	19.000000	68.000000	3.000000	2.000000	99.000000	11.000000	3.000000	10.000000	9.000000	2.000000	4.000000	-1.0	90.000000	3.000000

	Accident_Index	Vehicle_Reference	Casualty_Reference	Casualty_Class	Sex_of_Casualty	Age_of_Casualty	Age_Band_of_Casualty	Casualty_Severity	Pedestrian_Location	Pedestrian_Movement	Bus_or_Coach_Passenger	Pedestrian_Road_Maintenance_Worker	Casualty_Type	Casualty_Home_Area_Type
0	200501BS00001	1	1.0	3.0	1.0	56.0	7.0	2.0	1.0	1.0	0.0	-1.0	0.0	1.0
1	200501BS00002	1	1.0	2.0	1.0	47.0	7.0	3.0	0.0	0.0	4.0	-1.0	11.0	1.0
2	200501BS00003	2	1.0	1.0	1.0	62.0	9.0	3.0	0.0	0.0	0.0	-1.0	9.0	1.0
3	200501BS00004	1	1.0	3.0	2.0	30.0	6.0	3.0	5.0	2.0	0.0	-1.0	0.0	1.0
4	200501BS00005	1	1.0	1.0	1.0	49.0	8.0	3.0	0.0	0.0	0.0	-1.0	3.0	-1.0

	Vehicle_Reference	Vehicle_Type	Towing_and_Articulation	Vehicle_Manoeuvre	Vehicle_Location-Restricted_Lane	Junction_Location	Skidding_and_Overturning	Hit_Object_in_Carriageway	Vehicle_Leaving_Carriageway	Hit_Object_off_Carriageway	...	Was_Vehicle_Left_Hand_Drive?	Journey_Purpose_of_Driver	Sex_of_Driver	Age_of_Driver	Age_Band_of_Driver	Engine_Capacity_(CC)	Propulsion_Code	Age_of_Vehicle	Driver_IMD_Decile	Driver_Home_Area_Type
count	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	...	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000	866223.000000
mean	1.559869	9.667678	0.032816	12.772120	0.123591	2.411406	0.226169	0.309478	0.378220	0.571697	...	0.951786	10.730304	1.386455	33.147417	5.695801	1339.853980	0.622576	4.089737	3.391802	0.795520
std	0.741835	7.631000	0.312365	6.155444	0.953635	3.100804	0.721344	1.623085	1.400388	2.099478	...	0.328831	6.420805	0.591698	19.068999	2.933082	1714.813295	1.133182	5.023302	3.714446	1.133638
min	1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	...	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000	-1.000000
25%	1.000000	9.000000	0.000000	7.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	1.000000	2.000000	1.000000	21.000000	5.000000	-1.000000	-1.000000	-1.000000	-1.000000	1.000000
50%	1.000000	9.000000	0.000000	17.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000	...	1.000000	15.000000	1.000000	33.000000	6.000000	1348.000000	1.000000	3.000000	3.000000	1.000000
75%	2.000000	9.000000	0.000000	18.000000	0.000000	5.000000	0.000000	0.000000	0.000000	0.000000	...	1.000000	15.000000	2.000000	45.000000	7.000000	1799.000000	1.000000	8.000000	7.000000	1.000000
max	22.000000	90.000000	5.000000	18.000000	9.000000	8.000000	5.000000	12.000000	8.000000	10.000000	...	2.000000	15.000000	3.000000	99.000000	11.000000	99999.000000	10.000000	87.000000	10.000000	3.000000