7 高级处理-缺失值处理
Last updated
Last updated
# 读取电影数据
movie = pd.read_csv("./data/IMDB-Movie-Data.csv")pd.notnull(movie)Rank Title Genre Description Director Actors Year Runtime (Minutes) Rating Votes Revenue (Millions) Metascore
0 True True True True True True True True True True True True
1 True True True True True True True True True True True True
2 True True True True True True True True True True True True
3 True True True True True True True True True True True True
4 True True True True True True True True True True True True
5 True True True True True True True True True True True True
6 True True True True True True True True True True True True
7 True True True True True True True True True True False Truenp.all(pd.notnull(movie))# 不修改原数据
movie.dropna()
# 可以定义新的变量接受或者用原来的变量名
data = movie.dropna()# 替换存在缺失值的样本的两列
# 替换填充平均值,中位数
# movie['Revenue (Millions)'].fillna(movie['Revenue (Millions)'].mean(), inplace=True)for i in movie.columns:
if np.all(pd.notnull(movie[i])) == False:
print(i)
movie[i].fillna(movie[i].mean(), inplace=True)wis = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data")URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:833)># 全局取消证书验证
import ssl
ssl._create_default_https_context = ssl._create_unverified_context# 把一些其它值标记的缺失值,替换成np.nan
wis = wis.replace(to_replace='?', value=np.nan)# 删除
wis = wis.dropna()