import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import sklearn
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load the accident table. The LSOA location column is read as text so that
# pandas does not try to infer a type for it.
acci = pd.read_csv(
    'acc.csv',
    dtype={'LSOA_of_Accident_Location': str},
)
# Preview the first five rows (the default for head()).
acci.head(n=5)
Accident_Index Police_Force Accident_Severity Number_of_Vehicles Number_of_Casualties Date Day_of_Week Time Local_Authority_(District) Local_Authority_(Highway) ... Pedestrian_Crossing-Human_Control Pedestrian_Crossing-Physical_Facilities Light_Conditions Weather_Conditions Road_Surface_Conditions Special_Conditions_at_Site Carriageway_Hazards Urban_or_Rural_Area Did_Police_Officer_Attend_Scene_of_Accident LSOA_of_Accident_Location
0 200501BS00001 1 2 1 1 4/1/2005 3.0 17:42 12.0 E09000020 ... 0.0 1.0 1.0 2.0 2.0 0.0 0.0 1.0 1.0 E01002849
1 200501BS00002 1 3 1 1 5/1/2005 4.0 17:36 12.0 E09000020 ... 0.0 5.0 4.0 1.0 1.0 0.0 0.0 1.0 1.0 E01002909
2 200501BS00003 1 3 2 1 6/1/2005 5.0 0:15 12.0 E09000020 ... 0.0 0.0 4.0 1.0 1.0 0.0 0.0 1.0 1.0 E01002857
3 200501BS00004 1 3 1 1 7/1/2005 6.0 10:35 12.0 E09000020 ... 0.0 0.0 1.0 1.0 1.0 0.0 0.0 1.0 1.0 E01002840
4 200501BS00005 1 3 1 1 10/1/2005 2.0 21:13 12.0 E09000020 ... 0.0 0.0 7.0 1.0 2.0 0.0 0.0 1.0 1.0 E01002863

5 rows × 28 columns

# Load the casualty, vehicle and KCLT1 tables in one pass, in that order.
# NOTE(review): kclt is not referenced anywhere in the visible notebook.
casu, vehi, kclt = (
    pd.read_csv(csv_path)
    for csv_path in ('casuality.csv', 'vehicle.csv', 'KCLT1.CSV')
)
# Summary statistics, then a five-row preview, of the accident table.
# In a notebook only the last expression of a cell is rendered.
acci.describe()
acci.head()
Accident_Index Police_Force Accident_Severity Number_of_Vehicles Number_of_Casualties Date Day_of_Week Time Local_Authority_(District) Local_Authority_(Highway) ... Pedestrian_Crossing-Human_Control Pedestrian_Crossing-Physical_Facilities Light_Conditions Weather_Conditions Road_Surface_Conditions Special_Conditions_at_Site Carriageway_Hazards Urban_or_Rural_Area Did_Police_Officer_Attend_Scene_of_Accident LSOA_of_Accident_Location
0 200501BS00001 1 2 1 1 4/1/2005 3.0 17:42 12.0 E09000020 ... 0.0 1.0 1.0 2.0 2.0 0.0 0.0 1.0 1.0 E01002849
1 200501BS00002 1 3 1 1 5/1/2005 4.0 17:36 12.0 E09000020 ... 0.0 5.0 4.0 1.0 1.0 0.0 0.0 1.0 1.0 E01002909
2 200501BS00003 1 3 2 1 6/1/2005 5.0 0:15 12.0 E09000020 ... 0.0 0.0 4.0 1.0 1.0 0.0 0.0 1.0 1.0 E01002857
3 200501BS00004 1 3 1 1 7/1/2005 6.0 10:35 12.0 E09000020 ... 0.0 0.0 1.0 1.0 1.0 0.0 0.0 1.0 1.0 E01002840
4 200501BS00005 1 3 1 1 10/1/2005 2.0 21:13 12.0 E09000020 ... 0.0 0.0 7.0 1.0 2.0 0.0 0.0 1.0 1.0 E01002863

5 rows × 28 columns

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the casualty table.
casu.describe()
Vehicle_Reference Casualty_Reference Casualty_Class Sex_of_Casualty Age_of_Casualty Age_Band_of_Casualty Casualty_Severity Pedestrian_Location Pedestrian_Movement Car_Passenger Bus_or_Coach_Passenger Pedestrian_Road_Maintenance_Worker Casualty_Type Casualty_Home_Area_Type
count 713000.000000 712999.000000 712999.000000 712999.000000 712999.000000 712999.000000 712999.000000 712999.000000 712999.000000 712999.000000 712999.000000 712999.0 712999.000000 712999.000000
mean 1.478285 1.437173 1.496647 1.415758 33.241136 5.871056 2.867031 0.654889 0.456155 0.293881 0.090340 -1.0 7.635827 0.889673
std 0.619397 1.101567 0.701956 0.495390 18.632343 2.431920 0.372657 1.945577 1.620012 0.600751 0.565519 0.0 6.679438 1.084864
min 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.0 0.000000 -1.000000
25% 1.000000 1.000000 1.000000 1.000000 19.000000 4.000000 3.000000 0.000000 0.000000 0.000000 0.000000 -1.0 5.000000 1.000000
50% 1.000000 1.000000 1.000000 1.000000 30.000000 6.000000 3.000000 0.000000 0.000000 0.000000 0.000000 -1.0 9.000000 1.000000
75% 2.000000 2.000000 2.000000 2.000000 45.000000 7.000000 3.000000 0.000000 0.000000 0.000000 0.000000 -1.0 9.000000 1.000000
max 19.000000 68.000000 3.000000 2.000000 99.000000 11.000000 3.000000 10.000000 9.000000 2.000000 4.000000 -1.0 90.000000 3.000000
# Preview the first five rows of the casualty table.
casu.head()
Accident_Index Vehicle_Reference Casualty_Reference Casualty_Class Sex_of_Casualty Age_of_Casualty Age_Band_of_Casualty Casualty_Severity Pedestrian_Location Pedestrian_Movement Car_Passenger Bus_or_Coach_Passenger Pedestrian_Road_Maintenance_Worker Casualty_Type Casualty_Home_Area_Type
0 200501BS00001 1 1.0 3.0 1.0 56.0 7.0 2.0 1.0 1.0 0.0 0.0 -1.0 0.0 1.0
1 200501BS00002 1 1.0 2.0 1.0 47.0 7.0 3.0 0.0 0.0 0.0 4.0 -1.0 11.0 1.0
2 200501BS00003 2 1.0 1.0 1.0 62.0 9.0 3.0 0.0 0.0 0.0 0.0 -1.0 9.0 1.0
3 200501BS00004 1 1.0 3.0 2.0 30.0 6.0 3.0 5.0 2.0 0.0 0.0 -1.0 0.0 1.0
4 200501BS00005 1 1.0 1.0 1.0 49.0 8.0 3.0 0.0 0.0 0.0 0.0 -1.0 3.0 -1.0
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the vehicle table.
vehi.describe()
Vehicle_Reference Vehicle_Type Towing_and_Articulation Vehicle_Manoeuvre Vehicle_Location-Restricted_Lane Junction_Location Skidding_and_Overturning Hit_Object_in_Carriageway Vehicle_Leaving_Carriageway Hit_Object_off_Carriageway ... Was_Vehicle_Left_Hand_Drive? Journey_Purpose_of_Driver Sex_of_Driver Age_of_Driver Age_Band_of_Driver Engine_Capacity_(CC) Propulsion_Code Age_of_Vehicle Driver_IMD_Decile Driver_Home_Area_Type
count 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 ... 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000 866223.000000
mean 1.559869 9.667678 0.032816 12.772120 0.123591 2.411406 0.226169 0.309478 0.378220 0.571697 ... 0.951786 10.730304 1.386455 33.147417 5.695801 1339.853980 0.622576 4.089737 3.391802 0.795520
std 0.741835 7.631000 0.312365 6.155444 0.953635 3.100804 0.721344 1.623085 1.400388 2.099478 ... 0.328831 6.420805 0.591698 19.068999 2.933082 1714.813295 1.133182 5.023302 3.714446 1.133638
min 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000
25% 1.000000 9.000000 0.000000 7.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 1.000000 2.000000 1.000000 21.000000 5.000000 -1.000000 -1.000000 -1.000000 -1.000000 1.000000
50% 1.000000 9.000000 0.000000 17.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 ... 1.000000 15.000000 1.000000 33.000000 6.000000 1348.000000 1.000000 3.000000 3.000000 1.000000
75% 2.000000 9.000000 0.000000 18.000000 0.000000 5.000000 0.000000 0.000000 0.000000 0.000000 ... 1.000000 15.000000 2.000000 45.000000 7.000000 1799.000000 1.000000 8.000000 7.000000 1.000000
max 22.000000 90.000000 5.000000 18.000000 9.000000 8.000000 5.000000 12.000000 8.000000 10.000000 ... 2.000000 15.000000 3.000000 99.000000 11.000000 99999.000000 10.000000 87.000000 10.000000 3.000000

8 rows × 21 columns

What fraction of accidents occur in urban, rural and other (na) areas?

# Share of accidents per area type. This analysis treats
# Urban_or_Rural_Area code 1 as urban, 2 as rural and 3 as other (n/a).
area_code = acci['Urban_or_Rural_Area']

# Summing a boolean mask counts the matching rows without building a filtered
# copy of the whole frame for every category.
urban_acci = int((area_code == 1).sum())
rural_acci = int((area_code == 2).sum())
na_acci = int((area_code == 3).sum())
total_acci = urban_acci + rural_acci + na_acci

# An empty table (or one with no code 1/2/3 rows) would otherwise raise
# ZeroDivisionError; report 0% for every category in that case.
if total_acci:
    urban_pct = urban_acci / total_acci * 100
    rural_pct = rural_acci / total_acci * 100
    na_pct = na_acci / total_acci * 100
else:
    urban_pct = rural_pct = na_pct = 0.0

print("Percentage of accidents occur in urban areas is {0:.0f}%".format(urban_pct))
print("Percentage of accidents occur in rural areas is {0:.0f}%".format(rural_pct))
print("Percentage of accidents occur in other areas is {0:.0f}%".format(na_pct))

# Label the bars with the category names rather than the raw codes so the
# chart can be read without the code book.
x = ['Urban', 'Rural', 'Other']
y = [urban_pct, rural_pct, na_pct]
x_pos = list(range(len(x)))
plt.bar(x_pos, y)
plt.ylabel('Percentage of accidents')
plt.xticks(x_pos, x)
plt.title("Percentage of accidents occured by area")
plt.show()
Percentage of accidents occur in urban areas is 81%
Percentage of accidents occur in rural areas is 19%
Percentage of accidents occur in other areas is 0%

When is the most dangerous time to drive?

# Hour of day taken from the part of 'Time' before the first ':'.
# Values that do not parse as a number (e.g. a missing time, which becomes the
# text 'nan') are coerced to NaN and therefore match no hour below.
# pd.to_numeric is applied to the whole column at once instead of per element.
acci['Hour'] = pd.to_numeric(
    acci['Time'].astype(str).str.split(':').str[0],
    errors='coerce',
)

hour = list(range(24))
# Accident_Severity == 1 is what this analysis counts as a fatal accident.
is_fatal = acci['Accident_Severity'] == 1
num_of_fatal_acci = []
num_of_acci = []
for i in hour:
    in_hour = acci['Hour'] == i
    num_of_fatal_acci.append(int((is_fatal & in_hour).sum()))
    num_of_acci.append(int(in_hour.sum()))
print(hour)
print(num_of_fatal_acci)
print(num_of_acci)

# Percentage of accidents in each hour that were fatal. Hours with no
# accidents are left as NaN rather than triggering a divide-by-zero.
fatal_counts = np.array(num_of_fatal_acci, dtype=float)
all_counts = np.array(num_of_acci, dtype=float)
fatal_share = np.divide(
    fatal_counts,
    all_counts,
    out=np.full(len(hour), np.nan),
    where=all_counts > 0,
) * 100
normalized_num_of_fatal_acci = list(fatal_share)

fig = plt.figure(figsize=(14,8))

ax1 = fig.add_subplot(221)
ax1.plot(hour, num_of_fatal_acci)
ax1.set_ylabel('Number of fatal accidents')
ax1.set_xlabel('Hour')
ax1.grid(True)

ax2 = fig.add_subplot(222)
ax2.plot(hour, num_of_acci)
ax2.set_ylabel('Number of all accidents')
ax2.set_xlabel('Hour')
ax2.grid(True)

ax3 = fig.add_subplot(223)
ax3.plot(hour, normalized_num_of_fatal_acci)
ax3.set_ylabel('Percentage of fatal accidents in all accidents')
ax3.set_xlabel('Hour')
ax3.grid(True)

plt.show()

# nanargmax ignores the NaN placeholders; builtin max() on a list containing
# NaN depends on element order and can report the wrong hour.
if np.isnan(fatal_share).all():
    print("No accidents with a usable time were found")
else:
    print("The most dangerous hour to drive, when most fatal accidents happend in all accidents, is {} o'clock".format(int(np.nanargmax(fatal_share))))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
[43, 39, 36, 16, 14, 15, 23, 31, 35, 31, 42, 34, 51, 44, 56, 56, 56, 64, 62, 61, 50, 53, 54, 41]
[1460, 1082, 933, 559, 399, 536, 1222, 3121, 5663, 4144, 3827, 4443, 5351, 5433, 5369, 6882, 7090, 7645, 6085, 4845, 3814, 3029, 2500, 2282]
The most dangerous hour to drive, when most fatal accidents happend in all accidents, is 2 o'clock
# Count drivers involved in fatal accidents (Accident_Severity == 1), split by
# Sex_of_Driver code.
# NOTE(review): codes 1 and 2 are presumably male and female, as the names
# m and f suggest - confirm against the data dictionary.
temp = acci[['Accident_Index', 'Accident_Severity']]
temp1 = vehi[['Accident_Index', 'Sex_of_Driver']]

# The vehicle table has one row per vehicle and the accident table one row per
# accident, so the two must be joined on Accident_Index. Concatenating them
# side by side pairs unrelated rows by position and gives meaningless counts.
# NOTE(review): assumes Accident_Index is unique in acci - confirm.
x = temp1.merge(temp, on='Accident_Index', how='left')

in_fatal_accident = x['Accident_Severity'] == 1
m = int((in_fatal_accident & (x['Sex_of_Driver'] == 1)).sum())
f = int((in_fatal_accident & (x['Sex_of_Driver'] == 2)).sum())
print(m, f)
680 276

What is the trend in the number of accidents that occur each year?

# The year of an accident is taken from the first four characters of its
# Accident_Index; anything non-numeric is coerced to NaN.
year_prefix = acci['Accident_Index'].astype(str).str[:4]
acci['Year'] = pd.to_numeric(year_prefix, errors='coerce')

# Number of accident rows for each year from 2005 to 2014 inclusive.
year = list(range(2005, 2015))
num_of_acci_year = [len(acci[acci['Year'] == y]) for y in year]

plt.plot(year, num_of_acci_year)
plt.title('Correlation between number of accidents and year')
plt.xlabel('Year')
plt.ylabel('Number of accidents')
plt.grid(True)
plt.show()

Do accidents in high-speed-limit areas have more casualties?

# Average number of casualties per accident for each speed limit.
#
# groupby drops rows whose Speed_limit is NaN and returns the limits in
# ascending order. Building the list of limits with set()/sorted() kept NaN,
# which cannot be ordered reliably and matched zero rows, producing the 0/0
# "invalid value encountered" warning.
per_limit = (
    acci.groupby('Speed_limit')['Number_of_Casualties']
    .agg(['sum', 'size'])
)

speed_limit = per_limit.index.tolist()
num_casualty = per_limit['sum'].tolist()   # total casualties per limit
num_acci = per_limit['size'].tolist()      # accident rows per limit
# Every group has at least one row, so the division is always defined.
ratio = (per_limit['sum'] / per_limit['size']).tolist()

plt.plot(speed_limit, ratio)
plt.xlabel('Speed limit')
plt.ylabel('Casualty per accident, average')
plt.title('Correlation between casualty per accident and speed limit')
plt.grid(True)
plt.show()
<ipython-input-19-248ccd0298b6>:13: RuntimeWarning: invalid value encountered in longlong_scalars
  r = casualty / accident

How fast does the number of car accidents drop off with age? (Only consider car drivers who are legally allowed to drive in the UK: 17 years or older)

# Number of vehicles of type 9 (treated as "car" by this analysis) involved in
# accidents, for each driver age from 17 up to the oldest recorded driver.
age_acci = vehi[['Accident_Index', 'Age_of_Driver', 'Vehicle_Type']]

# The vehicle-type filter does not depend on the age, so apply it once
# instead of on every loop iteration.
car_driver_age = age_acci.loc[age_acci['Vehicle_Type'] == 9, 'Age_of_Driver']

# Series.max() is vectorised and skips NaN; int() keeps range() valid even if
# the column was read as float.
oldest_driver = int(age_acci['Age_of_Driver'].max())

age = list(range(17, oldest_driver + 1))
num_of_acci = [int((car_driver_age == a).sum()) for a in age]

plt.plot(age, num_of_acci, label = 'Data', marker = 'o')
plt.xlabel('Age')
plt.ylabel('Number of car accidents')
plt.title('Correlation between driver age and number of car accidents')
plt.grid(True)
plt.show()
# Number of accidents recorded under each Light_Conditions code.
#
# The chart is titled as a count per light condition, so count every accident
# per code. Plotting the Accident_Index strings of 24 rows against their light
# code shows identifiers, not counts.
light_counts = acci['Light_Conditions'].value_counts().sort_index()
LightConditions = light_counts.index.tolist()
accidents = light_counts.tolist()

fig = plt.figure(figsize=(10,8))
# The codes are categories, so draw one bar per code rather than a line.
plt.bar([str(code) for code in LightConditions], accidents)
plt.xlabel('Light_Conditions')
plt.ylabel('Accidents')
plt.title('no of accidents occurred as per light conditions')
plt.grid(True)
plt.show()