从今天开始,记录自己看过或参与过的项目,并把其中的一些细节用代码写出来。
首先,载入需要用到的库和数据:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import sklearn.metrics
# Read the insurance data set and drop rows in which every field is missing.
df = pd.read_csv('./insurance.csv').dropna(how='all')
# Summary statistics; the result is not stored, so it is only visible when the
# statement is evaluated interactively (e.g. in a notebook cell).
df.describe()
得到各数值列的统计分布情况(计数、均值、标准差、分位数等);接着查看前几行数据:
# Peek at the first rows; the result is not stored, so it is only visible when
# evaluated interactively.
df.head()

# Pairwise correlation between the numeric columns.
# numeric_only=True is needed because the frame still contains the string
# columns 'sex', 'smoker' and 'region' at this point (they are label-encoded
# only later in the script); without it DataFrame.corr raises on pandas >= 2.0.
corrMatrix = df.corr(numeric_only=True)

sns.set(font_scale=1.10)
plt.figure(figsize=(8, 8))
sns.heatmap(
    corrMatrix,
    vmax=.8,
    linewidths=0.01,
    square=True,
    annot=True,
    cmap='viridis',
    linecolor="white",
)
# Title names the data actually plotted (insurance features).
plt.title('Correlation between insurance features')
# Render the heatmap here, like every other figure in the script, so that it is
# finished before the following plots are drawn.
plt.show()
# Integer-truncated BMI, so that BMI can be counted and grouped as a discrete value.
df['bmi_int'] = df['bmi'].apply(int)
variables = ['sex', 'smoker', 'region', 'age', 'bmi_int', 'children']

print('数据分布分析:')
for column in variables:
    # The frame is re-ordered by the current column and the re-ordered frame
    # is kept for the rest of the script.
    df = df.sort_values(by=column)
    # One bar per distinct value, bar height = number of rows with that value.
    counts = df[column].value_counts()
    counts.plot(kind='bar')
    plt.title(column)
    plt.show()
print('平均医疗开销分析:')
for v in variables:
    # Average only the 'charges' column per group.  Taking the mean of the
    # whole frame would include the string columns (sex / smoker / region),
    # which raises a TypeError on pandas >= 2.0, and only 'charges' is plotted.
    group_df = df.groupby(v)[['charges']].mean()
    # Bars are drawn in ascending order of the grouping value.
    group_df = group_df.sort_index()
    group_df.plot(y=['charges'], kind='bar')
    plt.show()
print('两两变量分析:')
# Columns handed to the scatter-plot matrix: the six features plus 'charges'.
variables = [
    'sex', 'smoker', 'region',
    'age', 'bmi_int', 'children',
    'charges',
]
pair_df = df[variables]
sns_plot = sns.pairplot(pair_df)
plt.show()
print('建模与评估\n\n')

# Encode the three categorical columns as integers.  One encoder per column is
# kept at module level because the prediction step later reuses them to encode
# new samples with the same mapping.
le_sex = LabelEncoder()
le_smoker = LabelEncoder()
le_region = LabelEncoder()
df['sex'] = le_sex.fit_transform(df['sex'])
df['smoker'] = le_smoker.fit_transform(df['smoker'])
df['region'] = le_region.fit_transform(df['region'])

# Feature columns (raw 'bmi', not the truncated 'bmi_int') and the target.
variables = ['sex', 'smoker', 'region', 'age', 'bmi', 'children']
X = df[variables]
Y = df['charges']

# Split BEFORE scaling so that the scaler statistics come from the training
# rows only; fitting the scaler on all rows would leak test-set information
# into the preprocessing.  A fixed seed makes the reported metrics repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

regressor = RandomForestRegressor(n_estimators=200, random_state=42)
regressor.fit(X_train, y_train)

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)

# Report MAE and RMSE (square root of the MSE) on both splits.
print('RandomForestRegressor evaluating result:')
print("Train MAE: ", sklearn.metrics.mean_absolute_error(y_train, y_train_pred))
print("Train RMSE: ", np.sqrt(sklearn.metrics.mean_squared_error(y_train, y_train_pred)))
print("Test MAE: ", sklearn.metrics.mean_absolute_error(y_test, y_test_pred))
print("Test RMSE: ", np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_test_pred)))
print('特征重要度排序\n')

# Importance of each feature in the fitted forest, plus the spread of that
# importance across the individual trees (used as error bars below).
importances = regressor.feature_importances_
std = np.std([tree.feature_importances_ for tree in regressor.estimators_], axis=0)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
importance_list = [variables[i] for i in indices]

# Ranked listing: "<rank>.<feature>(<importance>)".
for rank, (variable, score) in enumerate(zip(importance_list, importances[indices]), start=1):
    print("%d.%s(%f)" % (rank, variable, score))

plt.figure()
plt.title("feature importance")
plt.bar(importance_list, importances[indices],
        color="r", yerr=std[indices], align="center")
plt.show()
print('在新数据上进行预测\n\n')


def _predict_cost(person):
    """Return the predicted charges for one person.

    ``person`` is a sequence in the same order as the training features:
    sex, smoker, region, age, bmi, children.  The first three entries are
    the raw category strings; they are encoded with the module-level label
    encoders, the whole row is scaled with the module-level scaler, and the
    fitted regressor produces the prediction.  ``person`` is not modified.
    """
    sex, smoker, region, *numeric = person
    encoded = [
        le_sex.transform([sex])[0],
        le_smoker.transform([smoker])[0],
        le_region.transform([region])[0],
        *numeric,
    ]
    return regressor.predict(sc.transform([encoded]))[0]


billy = ['male', 'yes', 'southeast', 25, 30.5, 2]
print('Billy - ', str(billy))
cost_for_billy = _predict_cost(billy)
print('Billy的医疗开销 = ', cost_for_billy, '\n\n')

dennis = ['female', 'no', 'southeast', 45, 19, 0]
print('Dennis - ', str(dennis))
cost_for_dennis = _predict_cost(dennis)
print('Dennis的医疗开销 = ', cost_for_dennis)
完。
相关知识
项目一:医疗费用预估
中国宠物医疗项目可行性研究报告
【宠物医疗加盟费】宠物医疗加盟费多少钱?总投资15.7万元!
宠物医院费用详解:宠物治疗项目及收费标准一览
宠物医疗费用全解析:让您的毛孩健康又省钱
宠物店创业项目简介选项目,项目找资源
宠物医疗费用控制如何合理规划宠物医疗支出
宠物的医疗费用是多少?
宠物医疗平台项目商业计划书PPT
在宠物店工作的员工谁来承担狗狗受伤的医疗费用?
网址: 项目一:医疗费用预估 https://m.mcbbbk.com/newsview333523.html
上一篇: 狗狗社交实例,考考大家能否分辨! |
下一篇: 日韩宠物研究报告:发展史给我们带 |