MLSR.data 源代码

import pandas as pd
from numpy import random, zeros
from jieba import suggest_freq, cut


class DataSet:
    """Utility class for loading and feature-engineering the student-aid data.

    Attributes:
        features: ``pandas.DataFrame`` holding the raw (or, after
            ``generate_feature``, the engineered) features.
        label: weak label per row. According to the original author's notes,
            0 means "especially difficult", 1 "generally difficult" and
            2 "not difficult".
        strong_label: refined label per row. The original notes describe four
            difficulty levels 0~3 (0 is the hardest) that ignore the
            "not difficult" case; rows without a strong label carry -1 so the
            data matches the scikit-learn convention.
        features_name: dict mapping the short column name (``'f0'``, ``'f1'``,
            ...) to the original, human readable feature name.
    """

    def __init__(self, filename: str = None, encode='gbk'):
        """Load a data set from a CSV file, or create an empty one.

        Args:
            filename: path of the CSV file; ``None`` creates an empty DataSet.
            encode: text encoding of the CSV file.
        """
        self.features_name = {}
        if filename is None:
            self.features = pd.DataFrame()
            # All three containers must exist on an empty DataSet, otherwise
            # merge()/reset_index() on it fail with AttributeError.
            self.label = pd.Series(dtype='int64')
            self.strong_label = pd.Series(dtype='int64')
            return
        data = pd.read_csv(filename, encoding=encode)
        # NOTE(review): Series.replace only replaces cells that are exactly
        # "\n"; it does not strip newlines inside longer strings. Kept as is
        # because do_income matches an entry that still contains "\n".
        data = data.apply(lambda x: x.replace("\n", ""))
        if '专家判定等级' in data.columns:
            # Expert grades are shifted down by one; every two strong levels
            # collapse into one weak label.
            self.strong_label = data['专家判定等级'] - 1
            self.label = self.strong_label // 2
        else:
            self.strong_label = pd.Series([-1] * len(data))
            self.label = data['院系认定贫困类型'].apply(
                lambda x: 0 if '特' in x else 1)
        self.features = data.drop(['院系认定贫困类型', '专家判定等级'],
                                  axis=1, errors='ignore')

    def merge(self, y):
        """Append another DataSet to the end of this one (in place).

        Args:
            y: the DataSet to append.

        Returns:
            This DataSet, now containing the rows of ``y`` as well.
        """
        # append() returned a new object whose result used to be thrown away,
        # so nothing was merged; concat + re-assignment performs the merge.
        self.features = pd.concat([self.features, y.features],
                                  ignore_index=True)
        self.label = pd.concat([self.label, y.label], ignore_index=True)
        self.strong_label = pd.concat([self.strong_label, y.strong_label],
                                      ignore_index=True)
        return self

    @staticmethod
    def static_merge(x, y):
        """Concatenate DataSet ``y`` after DataSet ``x`` into a new DataSet.

        Args:
            x: first DataSet; its ``features_name`` mapping is reused.
            y: second DataSet.

        Returns:
            A new DataSet; ``x`` and ``y`` are left untouched.
        """
        z = DataSet()
        # Copy so that later edits of z.features_name do not leak into x.
        z.features_name = dict(x.features_name)
        z.features = pd.concat([x.features, y.features], ignore_index=True)
        z.label = pd.concat([x.label, y.label], ignore_index=True)
        z.strong_label = pd.concat([x.strong_label, y.strong_label],
                                   ignore_index=True)
        return z

    def split_by_weak_label(self, reset_index: bool = True):
        """Split the rows by weak label 0 and weak label 1.

        Args:
            reset_index: renumber the rows of both results from 0 when True.

        Returns:
            (DataSet, DataSet): rows with ``label == 0`` first, rows with
            ``label == 1`` second. Rows with any other label are dropped.
        """
        parts = []
        for weak in (0, 1):
            part = DataSet()
            # Label-based selection: the index values come from self.label,
            # so .loc stays correct even when the index is not 0..n-1.
            index = self.label[self.label == weak].index
            part.features = self.features.loc[index]
            part.label = self.label.loc[index]
            part.strong_label = self.strong_label.loc[index]
            part.features_name = dict(self.features_name)
            parts.append(part.reset_index() if reset_index else part)
        return parts[0], parts[1]

    def reset_index(self):
        """Renumber features, label and strong_label from 0 (in place).

        Returns:
            This DataSet.
        """
        for frame in (self.features, self.label, self.strong_label):
            frame.reset_index(drop=True, inplace=True)
        return self

    def convert_to_ssl(self):
        """Turn the data set into one for the semi-supervised task.

        The weak label is appended as a new feature column and every row
        whose weak label is 2 is removed. The DataSet is modified in place.

        Returns:
            This DataSet with a fresh 0..n-1 index.
        """
        column = 'f' + str(len(self.features_name))
        self.features_name[column] = '院系认定贫困类型'
        self.features[column] = self.label
        index = self.label[self.label == 2].index
        self.features.drop(index, inplace=True)
        self.label.drop(index, inplace=True)
        self.strong_label.drop(index, inplace=True)
        return self.reset_index()

    @staticmethod
    def special_init(data: pd.DataFrame) -> pd.DataFrame:
        """Apply the clean-up specific to the original data set (deprecated).

        Keeps the first 16 columns, drops the rows labelled 1772 and 8297 and
        drops the columns 院系 / 专业 / 出生年月 / 所在校区 when present.

        Args:
            data: the DataFrame to clean.

        Returns:
            The cleaned DataFrame (the input is not modified).
        """
        import warnings
        # ``@DeprecationWarning`` as a decorator replaced the function with an
        # exception instance, making it uncallable; warn explicitly instead.
        warnings.warn('DataSet.special_init is deprecated',
                      DeprecationWarning, stacklevel=2)
        data = data[data.columns[:16]]
        data = data.drop(labels=[1772, 8297], errors='ignore')
        data = data.drop(['院系', '专业', '出生年月', '所在校区'],
                         axis=1, errors='ignore')
        return data

    @staticmethod
    def shuffle_and_pick(data, out_path: str = 'rand_select') -> tuple:
        """Shuffle the rows and split off the first 400 (deprecated).

        The input DataFrame is shuffled in place. Both parts are written to
        ``<out_path>_weak.csv`` and ``<out_path>_strong.csv`` (UTF-8).

        Args:
            data: the DataFrame to shuffle and split.
            out_path: path prefix of the two CSV files.

        Returns:
            (first 400 rows, remaining rows) as two DataFrames.
        """
        import warnings
        warnings.warn('DataSet.shuffle_and_pick is deprecated',
                      DeprecationWarning, stacklevel=2)
        # Shuffle by sorting on a temporary column of uniform random numbers.
        data['rand'] = random.uniform(0, 1, len(data))
        data.sort_values(by="rand", inplace=True)
        data.drop('rand', axis=1, inplace=True)
        # NOTE(review): the original docstring says the 400 sampled rows are
        # the ones to be re-annotated, yet they are named "weak" - confirm.
        weak_data = data[:400]
        strong_data = data[400:]
        weak_data.to_csv(out_path + '_weak.csv', encoding='utf-8')
        strong_data.to_csv(out_path + '_strong.csv', encoding='utf-8')
        return weak_data, strong_data

    @staticmethod
    def do_nation_policy(data: pd.DataFrame) -> pd.DataFrame:
        """Encode the column "享受国家政策资助情况" as boolean dummies.

        Args:
            data: DataFrame that contains "享受国家政策资助情况"; the optional
                columns "家庭主要经济来源" and "突发事件情况" add extra evidence.

        Returns:
            DataFrame with one boolean column per policy category.
        """
        policy = data["享受国家政策资助情况"]
        d = pd.DataFrame()
        d["建档立卡贫困户"] = policy.str.contains("立卡", na=False)
        d["城乡低保户"] = policy.str.contains("低保", na=False)
        if '家庭主要经济来源' in data.columns:
            d['城乡低保户'] |= data['家庭主要经济来源'].str.contains(
                '低保|最低生活保障', na=False)
        d["五保户"] = policy.str.contains("五保", na=False)
        d["孤残学生"] = policy.str.contains("孤残", na=False)
        if '突发事件情况' in data.columns:
            d['孤残学生'] |= data["突发事件情况"].str.contains(
                "父母双亡|父母去世|孤残|孤儿|重大疾病、突发意外致残|本人视力残疾|本人严重烫伤",
                na=False)
        d["军烈属或优抚子女"] = policy.str.contains("军烈属", na=False)
        return d

    @staticmethod
    def do_income(data: pd.DataFrame,
                  fill_to_no_income: bool = True) -> pd.DataFrame:
        """Encode "家庭主要经济来源" as dummies and pass the income through.

        Args:
            data: DataFrame with the columns "家庭主要经济来源" and
                "家庭人均年收入".
            fill_to_no_income: when True a missing income source is treated
                as '父母均下岗' before matching.

        Returns:
            DataFrame with seven boolean source columns followed by
            "家庭人均年收入".
        """
        source = data['家庭主要经济来源']
        if fill_to_no_income:
            # The fill used to be applied to a copy that was never matched
            # against, so the flag had no effect; match on the filled series.
            source = source.fillna('父母均下岗')
        d = pd.DataFrame(index=data.index)

        business_str = '生意|经营|从商|经商|地摊|摆摊|杂货铺|店|卖|' + \
            '买|个体|餐|理发|手工|个体|水果摊|蒸馒头|股票'
        d['经商'] = source.str.contains(business_str, na=False)

        farm_str = '务农|农作|农民|农收|农业|农务|农村|农活|耕|种植|种地|' + \
            '种粮|庄稼|田|农产品|土地|葡萄|果树|果园|畜|玉米|梨园|牧|养殖|苹果|枣'
        d['务农'] = source.str.contains(farm_str, na=False)

        retire_str = '退休|养老|退养|病休|内退|病退|退职'
        d['退休'] = source.str.contains(retire_str, na=False)

        low_str = '低保|最低生活保障'
        d['低保'] = source.str.contains(low_str, na=False)

        # The *_list values must match the whole cell, the *_str patterns may
        # match anywhere inside it.
        work_list = [
            '城镇', '父母劳作', '家长工资', '父亲、母亲', '父母微薄收入', '工薪',
            '基本工资', '职工工资', '父母工资', '父母工作收入', '父母', '工作',
            '父母工作', '父母收入', '工资收入', '工资'
        ]
        work_str = '打工|务工|农民工|工地|零工|临时工|工人|临工|短工|小工|散工|' + \
            '出租车|货车|教师|苦力|司机|体力劳动|保安|看守|送货|公交车|裁缝|' + \
            '保姆|上班|工活|教书|清洁工|营业员|城市|普通职工|诊所|超市工作|' + \
            '跑保险|打杂|干活|杂工|十字绣|代教|职员|瓷砖|看门|建房子|职工|' + \
            '房屋出租|房租|自由职业|副业|父母工资收入|父母劳动收入|劳务报酬|' + \
            '父母的工资|做工|劳动收入|卫生所从医|工作收入|不固定|不稳定|' + \
            '无稳定|非固定|非稳定|无固定|没有固定'
        d['打工'] = source.isin(work_list) | \
            source.str.contains(work_str, na=False)

        both_unemployed_list = [
            '未写', '暂无', '无', '兄长', '姐姐的工资', '哥哥工作', '姐姐 哥哥',
            '姐姐的工资收入', '哥哥工资', '本人及奶奶的低保金', '姐姐工资',
            '现靠父母过去的工资', '靠姑姑接济', '父亲无固定工作,现停业在家\n母亲一直无工作'
        ]
        both_unemployed_str = '父母无业|双方失业|父母均无业|父母下岗|均下岗|双下岗|' + \
            '亲友|接济|救济|资助|勤工俭学|经济扶持|补助|补贴|' + \
            '寄养家庭|父母离岗工资|社保'
        d['父母均下岗'] = source.isin(both_unemployed_list) | \
            source.str.contains(both_unemployed_str, na=False)

        one_unemployed_list = [
            '父母一方下岗', '父亲每月工资', '母亲基本工资', '父亲的薪水', '父亲基本工资',
            '父亲和兄长收入', '父亲姐姐工资', '母亲单位工资', '父亲上岗', '父亲劳务派遣',
            '父亲固定工资收入', '父亲个人工资', '父亲微薄工资', '4050公益岗位收入',
            '父兄工资', '爸爸', '父亲的收入', '父亲的工作', '父亲', '母亲'
        ]
        one_unemployed_str = \
            '一方|母亲固定收入|父亲固定收入|母亲下岗|父亲下岗|' + \
            '父亲失业|母亲失业|一人工资|一人的工资|爸爸工资|妈妈工资|' + \
            '爸爸的工资|妈妈的工资|父亲的工资|父亲工资|母亲的工资|' + \
            '母亲工资|父亲工作|母亲工作|爸爸工作|妈妈工作|父亲上班|' + \
            '母亲上班|父亲收入|母亲收入|父亲无业'
        d['父母一方下岗'] = source.isin(one_unemployed_list) | \
            source.str.contains(one_unemployed_str, na=False)

        d['家庭人均年收入'] = data['家庭人均年收入']
        return d

    @staticmethod
    def do_education(s: pd.Series) -> pd.DataFrame:
        """Count family members currently in college / high school / school.

        Every cell is segmented with jieba; recognised words are turned into
        tags (``cut_type``) and the tag sequence is scanned for patterns.
        Original author's notes on the patterns:

        * A count and a family member may both appear, the count directly
          before the member, so the count is checked first.
        * A school and a grade may both appear, the grade after the school.
        * count -> school/grade -> other: that many people attend the school.
        * (count ->) member -> school/grade -> other: these people attend it.
        * (count ->) member -> school/grade -> school/grade -> other: one
          person is counted per listed stage.

        Args:
            s: the Series "家庭其他成员在受教育情况".

        Returns:
            DataFrame with the float columns "大学", "高中" and "义务教育",
            indexed 0..len(s)-1.
        """
        s = s.fillna('无')
        zero_tmp1 = ['暂无', '独生', '无在读', '无在受', '无其他', '无成员', '无高中', '无正在']
        zero_tmp2 = ['0', '0人', '0人在读高中或大学']

        def zero_fun(x):
            # True when the cell states that nobody is being educated.
            return x in zero_tmp2 or any(key in x for key in zero_tmp1)

        s = s.apply(lambda x: '无' if zero_fun(x) else x)

        # Keyword sets, one per tag.
        number1 = {"一个", "1个", "一人", "1人", "一位", "1位"}
        number2 = {"两个", "2个", "两人", "2人", "二人", "两位", "2位", "二位"}
        number3 = {"三个", "3个", "三人", "3人", "三位", "3位"}
        number4 = {"四个", "4个", "四人", "4人", "四位", "4位"}
        member = {"哥哥", "姐姐", "弟弟", "妹妹", "侄女", "侄子"}
        sp_member = {"哥", "兄", "姐", "弟", "妹", "大哥", "二哥", "大弟", "二弟", "三弟",
                     "四弟", "五弟", "小弟", "大姐", "长姐", "二姐", "三姐", "大妹", "小妹",
                     "二妹", "三妹", "四妹"}
        invalid_member = {"爸爸", "父亲", "妈妈", "母亲", "爷爷", "奶奶", "外祖父", "外祖母",
                          "姥爷", "姥姥", "伯伯", "婆婆", "外公", "外婆"}
        invalid_sp_member = {"爸", "妈", "爷", "奶", "祖父", "祖母", "父", "母"}
        grad = {"幼儿园毕业", "刚毕业", "小学毕业", "小学未毕业", "初中毕业", "初中未毕业",
                "高中毕业", "大专毕业", "专科毕业", "大学毕业", "三本毕业", "应届毕业",
                "大学已毕业", "大学刚毕业", "本科毕业", "研究生毕业", "硕士毕业", "博士毕业"}
        sp_grad = {"毕业", "未受教育", "未接受教育", "文盲", "无学历", "没上过学", "肄业", "未上学"}
        college = {"研究生", "读研", "博士", "硕士", "大学", "本科", "专升本", "大学生", "河工大"}
        sp_college = {"考研", "大专", "学院", "高职", "专科", "职业技术学校"}
        gr_college = {"大一", "大二", "大三", "大四", "研一", "研二", "研三", "博二", "读博"}
        high_school = {"高中", "高专", "职高", "职专", "职中", "中专", "中学", "职业高中"}
        gr_high_school = {"高一", "高二", "高三", "高考"}
        compulsory = {"初中", "小学"}
        gr_compulsory = {"初一", "初二", "初三", "初中三年级", "一年级", "二年级", "三年级",
                         "四年级", "五年级", "六年级", "七年级", "八年级", "九年级", "义务教育阶段"}
        others = {"幼儿园", "学前班", "学前教育"}

        # Segmentation hints; the original author found wrong splits while
        # checking the first 200 rows by hand.
        suggest_word = [
            '兄弟', '高二', '小学毕业', '初中毕业', '高中毕业', '职高毕业', '大学毕业', '大专毕业',
            '1人', '1个', '1位', '2人', '2个', '2位', '3人', '3个', '3位', '4人',
            '西安交通大学', '海南师范大学', '河工大', '中国科学院', '北京航空航天大学', '北京大学',
            '义务教育阶段'
        ]
        suggest_split = [
            ('父', '母'), ('兄', '妹'), ('兄', '妹'), ('姐', '弟'), ('姐', '妹'),
            ('读', '高二'), ('中医药', '大学'), ('农', '学院')
        ]
        for word in suggest_word:
            # NOTE(review): jieba documents that a tuple asks for the word to
            # be cut into those segments, while a plain str keeps it whole;
            # tuple(word) requests a per-character split. Left unchanged
            # because the extracted features were tuned with it - confirm.
            suggest_freq(tuple(word), True)
        for pair in suggest_split:
            suggest_freq(pair, True)

        # Tag lookup in priority order: the first set containing a word wins.
        tag_table = [
            ("number1", number1), ("number2", number2),
            ("number3", number3), ("number4", number4),
            ("member", member), ("sp_member", sp_member),
            ("invalid_member", invalid_member),
            ("invalid_sp_member", invalid_sp_member),
            ("grad", grad), ("sp_grad", sp_grad),
            ("college", college), ("sp_college", sp_college),
            ("gr_college", gr_college),
            ("high_school", high_school), ("gr_high_school", gr_high_school),
            ("compulsory", compulsory), ("gr_compulsory", gr_compulsory),
            ("others", others),
        ]
        number_value = {"number1": 1, "number2": 2, "number3": 3, "number4": 4}
        numbers = tuple(number_value)
        members = ("member", "sp_member")
        all_members = members + ("invalid_member", "invalid_sp_member")
        starters = numbers + all_members      # tags that may open a pattern
        college_tags = ("college", "sp_college", "gr_college")
        high_tags = ("high_school", "gr_high_school")
        compulsory_tags = ("compulsory", "gr_compulsory")
        school_tags = (("grad", "sp_grad") + college_tags + high_tags +
                       compulsory_tags + ("others",))
        group_last = (" ",) + school_tags     # tag that may close a pattern
        group_next = (" ",) + starters        # tag that may follow a pattern

        arr = zeros(shape=(len(s), 3))
        df = pd.DataFrame(arr, columns=["大学", "高中", "义务教育"])

        for row, text in enumerate(s):
            # Tag every recognised word; unrecognised words are skipped.
            cut_type = []
            for token in cut(text, cut_all=False):
                for tag, words in tag_table:
                    if token in words:
                        cut_type.append(tag)
                        break
            # Two sentinels so that look-ahead by two never leaves the list.
            cut_type += [" ", " "]

            u_num = 0   # college students
            h_num = 0   # high-school students
            c_num = 0   # pupils in compulsory education
            number = 0  # pending head count
            serial = 0
            while serial < len(cut_type) - 2:
                # A pattern can only start with a count or a family member.
                if cut_type[serial] not in starters:
                    serial += 1
                    continue

                current_group = [cut_type[serial]]
                for forward in range(serial + 1, len(cut_type) - 1):
                    current_group.append(cut_type[forward])
                    # school/grade followed by a non-school tag closes the
                    # group; continue with the first tag of the next group.
                    if (cut_type[forward] in group_last and
                            cut_type[forward + 1] in group_next):
                        serial = forward + 1
                        break

                # The group is complete, match it against the patterns.
                current_group += [" ", " "]
                group_serial = 0
                while group_serial < len(current_group) - 2:
                    cur = current_group[group_serial]
                    nxt = current_group[group_serial + 1]
                    # count -> school
                    if cur in numbers and nxt in college_tags:
                        u_num += number_value[cur]
                    elif cur in numbers and nxt in high_tags:
                        h_num += number_value[cur]
                    elif cur in numbers and nxt in compulsory_tags:
                        c_num += number_value[cur]
                    # (count ->) member -> one school -> non-school
                    elif (cur in members and nxt in school_tags and
                          current_group[group_serial + 2] not in school_tags):
                        # At group_serial == 0 the index -1 reads the trailing
                        # sentinel, which counts as a single person.
                        number += number_value.get(
                            current_group[group_serial - 1], 1)
                        if nxt in college_tags:
                            u_num += number
                        elif nxt in high_tags:
                            h_num += number
                        elif nxt in compulsory_tags:
                            c_num += number
                        number = 0
                    # member -> several schools -> non-school
                    elif cur in members and nxt in school_tags:
                        group_serial += 1
                        while group_serial < len(current_group) - 2:
                            tag = current_group[group_serial]
                            if tag not in school_tags:
                                break
                            elif tag in college_tags:
                                u_num += 1
                            elif tag in high_tags:
                                h_num += 1
                            elif tag in compulsory_tags:
                                c_num += 1
                            group_serial += 1
                    # several members -> school -> non-school
                    elif cur in members and nxt in all_members:
                        # NOTE(review): counting starts at the second member,
                        # the first one is not added - confirm intent.
                        group_serial += 1
                        while group_serial < len(current_group) - 2:
                            tag = current_group[group_serial]
                            if tag not in all_members:
                                break
                            elif tag in members:
                                number += 1
                            group_serial += 1
                        tag = current_group[group_serial]
                        if tag in college_tags:
                            u_num += number
                        elif tag in high_tags:
                            h_num += number
                        elif tag in compulsory_tags:
                            c_num += number
                        number = 0
                    group_serial += 1
            df.iloc[row] = [u_num, h_num, c_num]
        return df

    @staticmethod
    def do_accident(s: pd.Series):
        """Recognise sudden family events and encode them as dummies.

        Every cell is segmented with jieba and recognised words are tagged as
        a person or an event. A group starts with a person and runs until an
        event is followed by another person; the events of a group are
        attributed to the people of that group. Original author's notes:

        * One person may have several events, listed in order with no other
          person in between.
        * jieba cannot tell "父母", "父/母", "父(母)" and "父亲(母亲)" apart,
          so father and mother only count as "both parents" when their tokens
          are directly adjacent.
        * For grandparents any illness counts; for siblings only serious
          illness; only the death of a father or mother is recorded.

        The natural-disaster column is filled by a plain regex search on the
        whole cell.

        Args:
            s: the Series "突发事件情况".

        Returns:
            DataFrame with 11 float (0/1) columns, indexed 0..len(s)-1.
        """
        s = s.fillna('无')
        zero_tmp1 = ['无', '否', '没有', '正常', '暂无']
        s = s.apply(lambda x: '无' if x in zero_tmp1 else x)

        # Keyword sets, one per tag.
        dad = {"爸爸", "父亲", "爸", "父"}
        mom = {"妈妈", "母亲", "妈", "母"}
        grand_parents = {"老人", "长辈", "祖父母", "爷爷", "奶奶", "外祖父", "外祖母",
                         "姥爷", "姥姥", "外公", "外婆"}
        sp_grand_parents = {"爷", "奶", "祖父", "祖母"}
        siblings = {"哥哥", "姐姐", "弟弟", "妹妹"}
        sp_siblings = {"哥", "兄", "姐", "弟", "妹", "大哥", "二哥", "大弟", "二弟", "三弟",
                       "四弟", "五弟", "小弟", "大姐", "长姐", "二姐", "三姐", "大妹", "小妹",
                       "二妹", "三妹", "四妹"}
        invalid_member = {"我", "本人", "自己", "侄女", "侄子", "伯伯", "伯母", "大伯", "二伯",
                          "三伯", "伯父", "婆婆", "舅舅", "小姑", "姑姑", "二姑", "大舅", "叔叔",
                          "叔父", "二叔", "老叔", "舅妈"}
        divorce = {"单亲", "离婚", "离异"}
        unemployed = {"无业", "一方无业", "均无业", "失业", "无法工作", "下岗", "公司破产",
                      "无工作", "待业", "倒闭", "离职", "未有收入", "无收入", "停产",
                      "失去稳定工作", "没能工作"}
        dead = {"去世", "离世", "病逝", "死亡", "治丧", "身亡", "病故"}
        illness = {
            "病", "病了", "就医", "发病", "有病", "多病", "车祸", "住院", "养病", "疾病", "病情",
            "受伤", "顽疾", "服药", "腰伤", "慢性疾病", "普通疾病", "一般疾病", "生病", "患病",
            "带病", "患疾", "皮肤病", "高血压", "高血糖", "高血脂", "风湿", "类风湿", "风湿病",
            "心脏病", "糖尿病", "高血脂", "三高", "囊肿", "肝囊肿", "结石", "肾结石", "胆结石",
            "结石病", "尿结石", "肾囊肿", "肾积水", "脑溢血", "脑血栓", "心脑血管疾病", "心脑疾病",
            "青光眼", "慢阻肺", "中风", "白内障", "肺结核", "冠心病", "甲亢", "癫痫", "股骨头坏死",
            "风湿", "腿脚不便", "精神病", "精神疾病", "精神分裂症", "精神性疾病", "气胸", "胃穿孔",
            "骨折", "骨裂", "红斑狼疮", "腰椎间盘突出", "腰间盘突出", "关节炎", "骨质增生",
            "胃溃疡", "手术", "腿疾", "胃病", "感染", "胰腺炎", "溃烂", "摔伤", "腿伤", "睡眠障碍",
            "工伤", "视网膜", "白癜风", "关节病", "颈椎病", "胆囊炎", "坠楼", "瘸", "贫血",
            "脱髓鞘", "事故", "意外事故", "服药", "体弱", "卷入机器", "气管炎", "支气管炎", "卧病",
            "交通事故", "吃药", "胃出血", "脑出血", "颅内出血", "子宫肌瘤", "腰椎", "颈椎",
            "腰椎病", "后遗症", "割伤", "脑垂体瘤", "脊椎炎", "扎伤", "烫伤", "肺气肿", "卧床",
            "断裂", "眼疾", "伤手", "摔了", "吃药", "旧病复发", "切断", "摔到", "意外事故"}
        serious_illness = {
            "大病", "病重", "重病", "重疾", "重大疾病", "肌无力", "肿瘤", "瘤", "白血病", "癌",
            "患癌", "癌症", "肝癌", "食道癌", "卵巢癌", "甲状腺癌", "肺癌", "宫颈癌", "脑癌",
            "直肠癌", "乳腺癌", "胃癌", "肺腺癌", "贲门癌", "喷门癌", "食道癌", "肠癌", "乳癌",
            "结肠癌", "前列腺癌", "致癌", "肾癌", "淋巴癌", "食道癌", "心梗", "心肌梗塞", "脑中风",
            "移植", "搭桥", "支架", "肾炎", "肾病综合征", "肾综合", "严重肾病", "肾衰竭", "尿毒症",
            "截肢", "肝病", "肝硬化", "肝炎", "干重活", "做重活", "不能工作", "失去劳动力",
            "丧失劳动力", "丧失行动力", "无法劳作", "丧失劳动能力", "失去部分劳动力",
            "失去全部劳动力", "无法承受过重劳动", "失去行动能力", "干不了", "干重活", "不能劳作",
            "不得剧烈运动", "脑梗", "脑梗塞", "脑梗死", "脑膜炎", "脑膜瘤", "昏迷", "聋", "失聪",
            "耳聋", "聋哑人", "聋哑", "失明", "瘫痪", "偏瘫", "脑瘫", "截瘫", "致瘫", "帕金森",
            "瓣膜病", "痴呆", "老年痴呆", "老年痴呆症", "烧伤", "火烧", "语言", "贫血", "主动脉",
            "残疾", "残废", "伤残", "摔断", "砸断", "神志不清", "骨髓瘤", "致残", "脑萎缩",
            "脑血管", "脑结核", "半身不遂", "致盲", "病危", "再生性贫血障碍", "生活无法自理", "做手术"
        }

        suggest_word = [
            '严重肾病', '肾病综合征', '失去劳动力', '丧失劳动力', '丧失行动力', '无法劳作',
            '不能劳作', '失去部分劳动力', '失去部分劳动力', '失去行动能力', '丧失劳动能力',
            '无法承受过重劳动', '心脑血管疾病', '心脑疾病', '肺腺癌', '脑梗', '不得剧烈运动',
            '重大疾病', '普通疾病', '一般疾病', '一方无业', '均无业', '股骨头坏死', '干重活',
            '做重活', '腿脚不便', '无法工作', '工作', '精神性疾病', '精神官能症', '公司破产',
            '腰椎间盘突出', '腰间盘突出', '腿疾', '无工作', '慢性疾病', '睡眠障碍', '卷入机器',
            '颅内出血', '子宫肌瘤', '未有收入', '脑垂体瘤', '病了', '再生性贫血障碍', '摔了',
            '生活无法自理', '失去稳定工作', '没能工作', '旧病复发', '摔到'
        ]
        for _word in suggest_word:
            # NOTE(review): tuple(_word) asks jieba for a per-character split
            # (a str would keep the word whole). Left unchanged - confirm.
            suggest_freq(tuple(_word), True)
        suggest_split = [
            ('癫痫', '病'), ('卧床', '不起'), ('黑色素', '瘤'), ('单亲', '家庭'),
            ('恶性', '肿瘤'), ('断', '腿'), ('断', '手'), ('家', '父'), ('家', '母')
        ]
        for _split in suggest_split:
            suggest_freq(_split, True)

        arr = zeros(shape=(len(s), 11))
        df = pd.DataFrame(arr, columns=[
            "祖父母患病", "父母离异", "父亲(母亲)患普通疾病", "父母患普通疾病",
            "父亲(母亲)无业", "父母均无业", "兄弟姐妹患重疾", "父亲(母亲)患重疾",
            "父母患重疾", "父亲(母亲)去世", "突发重大自然灾害"
        ])

        # Natural disasters: regex search on the whole cell. Positions are
        # used because df is indexed 0..n-1 regardless of the index of s.
        disaster = s.str.contains(
            "灾|病虫害|霜冻|地震|台风|洪水|大水|大旱|干旱|冰雹|暴风雨|暴雨|下雪|雷劈|自然状况|" +
            "禽流感|高温|减产|倒伏|淹|涝|庄稼大量死亡|自然状况|自然天气状况|减产|泥石流|猪瘟|庄稼无收")
        for position, hit in enumerate(disaster):
            if hit:
                df.loc[position, "突发重大自然灾害"] = 1

        # Tag lookup in priority order: the first set containing a word wins
        # (e.g. "贫血" is in both illness sets and is tagged "illness").
        tag_table = [
            ("dad", dad), ("mom", mom),
            ("grand_parents", grand_parents),
            ("sp_grand_parents", sp_grand_parents),
            ("siblings", siblings), ("sp_siblings", sp_siblings),
            ("invalid_member", invalid_member),
            ("divorce", divorce), ("unemployed", unemployed), ("dead", dead),
            ("illness", illness), ("serious_illness", serious_illness),
        ]
        people = ("dad", "mom", "grand_parents", "sp_grand_parents",
                  "siblings", "sp_siblings", "invalid_member")
        events = ("divorce", "unemployed", "dead", "illness",
                  "serious_illness")
        known = people + events
        group_last = (" ",) + events    # tag that may close a group
        group_next = (" ",) + people    # tag that may follow a group

        for row, text in enumerate(s):
            cut_type = []   # tag of every recognised word
            cut_loc = []    # token position of every recognised word
            for position, token in enumerate(cut(text, cut_all=False)):
                for tag, words in tag_table:
                    if token in words:
                        cut_type.append(tag)
                        cut_loc.append(position)
                        break
            # Two sentinels so that look-ahead never leaves the lists.
            cut_type += [" ", " "]
            cut_loc += [" ", " "]

            serial = 0
            while serial < len(cut_type) - 2:
                # A group can only start with a person.
                if cut_type[serial] not in people:
                    serial += 1
                    continue

                current_group = [cut_type[serial]]
                current_loc = [cut_loc[serial]]
                for forward in range(serial + 1, len(cut_type) - 1):
                    current_group.append(cut_type[forward])
                    current_loc.append(cut_loc[forward])
                    # event followed by a person closes the group; continue
                    # with the first tag of the next group.
                    if (cut_type[forward] in group_last and
                            cut_type[forward + 1] in group_next):
                        serial = forward + 1
                        break

                current_group += [" ", " "]
                current_loc += [" ", " ", " ", " "]

                seen = set()              # tags met so far in this group
                parents_together = False  # father and mother adjacent
                group_serial = 0
                while group_serial < len(current_group) - 2:
                    cur = current_group[group_serial]
                    if cur in known:
                        nxt = current_group[group_serial + 1]
                        if nxt != " " and nxt not in known:
                            break
                        seen.add(cur)
                        # "dad mom" / "mom dad" as neighbouring tokens means
                        # the text talks about both parents together.
                        if (((cur == "dad" and nxt == "mom") or
                             (cur == "mom" and nxt == "dad")) and
                                current_loc[group_serial + 1] ==
                                current_loc[group_serial] + 1):
                            parents_together = True

                    # Evaluate the rules with everything seen so far.
                    has_dad = "dad" in seen
                    has_mom = "mom" in seen
                    one_parent = (has_dad or has_mom) and not parents_together
                    both_parents = has_dad and has_mom and parents_together
                    grand = ("grand_parents" in seen or
                             "sp_grand_parents" in seen)
                    sibling = "siblings" in seen or "sp_siblings" in seen
                    ill = "illness" in seen
                    serious = "serious_illness" in seen
                    jobless = "unemployed" in seen

                    if grand and (ill or serious):
                        df.loc[row, "祖父母患病"] = 1
                    if "divorce" in seen:
                        df.loc[row, "父母离异"] = 1
                    if ill and one_parent:
                        df.loc[row, "父亲(母亲)患普通疾病"] = 1
                    if ill and both_parents:
                        df.loc[row, "父母患普通疾病"] = 1
                    if jobless and one_parent:
                        df.loc[row, "父亲(母亲)无业"] = 1
                    if jobless and both_parents:
                        df.loc[row, "父母均无业"] = 1
                    if serious and sibling:
                        df.loc[row, "兄弟姐妹患重疾"] = 1
                    if serious and one_parent:
                        df.loc[row, "父亲(母亲)患重疾"] = 1
                    if serious and both_parents:
                        df.loc[row, "父母患重疾"] = 1
                    if "dead" in seen and one_parent:
                        df.loc[row, "父亲(母亲)去世"] = 1
                    group_serial += 1
        return df

    @staticmethod
    def do_scholarship(s: pd.Series) -> pd.DataFrame:
        """Summarise the grants received while enrolled.

        Args:
            s: the Series "在校受奖励资助情况"; missing values count as none.

        Returns:
            DataFrame with the columns "助学金个数" (number of grants),
            "助学金金额" (total amount) and "国助类型" (0 no national grant,
            1 national grant, 2 national grant whose text contains "一").
        """
        scholar_map = {
            '慧明': 5000, '欧莱雅': 5000, '喜来健': 5000, '中海油': 5000, '承锋': 5000,
            '清茗雅轩': 3000, '盛帆': 3000, '福慧': 2000, '柏年': 2000, '圣恩纳': 2000,
            '香港好友': 2000, '国泰': 5000, '思源': 4000, '宋声扬': 5000, '长城': 3000,
            '交通': 2500, '冯顾丽华': 2000, '电装': 3000, '圆梦启航': 6600
        }

        def summarise(x):
            amounts = [v for k, v in scholar_map.items() if k in x]
            cnt = len(amounts)
            tot = sum(amounts)
            is_national = 0
            if '国家' in x or '国助' in x:
                cnt += 1
                if "一" in x:
                    is_national = 2
                    tot += 3800
                else:
                    is_national = 1
                    tot += 2800
            return cnt, tot, is_national

        filled = s.fillna('无')
        # Building the frame directly also works for an empty input.
        return pd.DataFrame([summarise(x) for x in filled],
                            columns=['助学金个数', '助学金金额', '国助类型'],
                            index=filled.index)

    @staticmethod
    def do_resident_type(s: pd.Series) -> pd.Series:
        """Encode the household registration type as a boolean.

        Missing values are treated as '城镇'. The result is True when exactly
        one of the characters '非' and '农' occurs in the text.

        Args:
            s: the Series "入学前户口性质".

        Returns:
            Boolean Series.
        """
        d = s.fillna('城镇')
        return d.apply(lambda x: ('非' in x) != ('农' in x))

    @staticmethod
    def do_household(s: pd.Series) -> pd.Series:
        """Parse the family size; missing values are treated as 3.

        Args:
            s: the Series "家庭人口"; numbers or strings such as '三口'.

        Returns:
            Series of ints.
        """
        # Chinese digits become ASCII digits, the unit characters vanish.
        table = str.maketrans({
            '一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6',
            '七': '7', '八': '8', '九': '9', '人': '', '口': ''
        })

        def func(x):
            if isinstance(x, str):
                x = x.translate(table)
            return int(x)

        return s.fillna(3).apply(func)

    @staticmethod
    def do_loan(s: pd.Series):
        """Recognise whether the student has a student loan.

        Missing values count as no loan. Per the original notes only
        student loans (home-region or campus) are meant to count; a cell is
        True when it contains a "yes" keyword and no "no" keyword.

        Args:
            s: the Series "是否贷款".

        Returns:
            Boolean Series.
        """
        yes_list = ['是', '√', '19500', '32000', '助学', '生源地', '校园', '扶贫', '学',
                    '有', '贷款']
        no_list = ['否', '无', '未', '-', '房', '50000', '10万', '借', '商业', '家', '父亲']

        def fun(x):
            x = str(x)  # tolerate numeric cells such as 19500
            if any(word in x for word in no_list):
                return False
            return any(word in x for word in yes_list)

        return s.fillna('-').apply(fun)

    @staticmethod
    def do_ethnic_group(s: pd.Series) -> pd.Series:
        """Recognise ethnic minorities; missing values are treated as Han.

        Args:
            s: the Series "民族".

        Returns:
            Boolean Series, True when the text does not contain '汉'.
        """
        return s.fillna('汉').apply(lambda x: '汉' not in x)

    def generate_feature(self):
        """Replace the raw features by engineered ones named f0, f1, f2, ...

        The mapping from the short names to the readable names is stored in
        ``features_name``.

        Returns:
            This DataSet.
        """
        raw = self.features
        # NOTE(review): do_education/do_accident return frames indexed
        # 0..n-1 while the others keep the index of self.features; concat
        # aligns on the index, so call reset_index() first if it is not
        # the default one.
        d = [
            DataSet.do_nation_policy(
                raw[['享受国家政策资助情况', '突发事件情况', '家庭主要经济来源']]),
            DataSet.do_income(raw[['家庭主要经济来源', '家庭人均年收入']]),
            DataSet.do_education(raw['家庭其他成员在受教育情况']),
            DataSet.do_accident(raw['突发事件情况']),
            DataSet.do_scholarship(raw['在校受奖励资助情况']),
            DataSet.do_ethnic_group(raw['民族']),
            DataSet.do_household(raw['家庭人口']),
            DataSet.do_loan(raw['是否贷款']),
            DataSet.do_resident_type(raw['入学前户口性质'])
        ]
        new_f = pd.concat(d, axis=1)
        # Unemployment found in the accident text is folded into the
        # income-source dummies, then the duplicate columns are removed.
        new_f['父母均下岗'] |= new_f['父母均无业']
        new_f['父母一方下岗'] |= new_f['父亲(母亲)无业']
        new_f.drop(['父母均无业', '父亲(母亲)无业'], axis=1, inplace=True)
        self.features = new_f
        self.features_name = {'f' + str(i): x
                              for i, x in enumerate(self.features.columns)}
        self.features.columns = self.features_name.keys()
        return self

    @staticmethod
    def data_augment(n: int = 1000, filename: str = None):
        """Generate synthetic rows labelled 2 ("not difficult").

        Original author's notes: real data sets usually contain no
        non-difficult students, which makes models less robust, so such rows
        are generated by rule. Income is drawn from a log-normal distribution
        as a stop-gap.

        Args:
            n: number of rows to generate.
            filename: described by the original author as the save path;
                currently unused.

        Returns:
            DataSet with ``n`` rows, ``label == 2`` and ``strong_label == -1``.
        """
        d = DataSet()
        d.features_name = {
            'f0': '建档立卡贫困户', 'f1': '城乡低保户', 'f2': '五保户', 'f3': '孤残学生',
            'f4': '军烈属或优抚子女', 'f5': '经商', 'f6': '务农', 'f7': '退休',
            'f8': '低保', 'f9': '打工', 'f10': '父母均下岗', 'f11': '父母一方下岗',
            'f12': '家庭人均年收入', 'f13': '大学', 'f14': '高中', 'f15': '义务教育',
            'f16': '祖父母患病', 'f17': '父母离异', 'f18': '父亲(母亲)患普通疾病',
            'f19': '父母患普通疾病', 'f20': '兄弟姐妹患重疾', 'f21': '父亲(母亲)患重疾',
            'f22': '父母患重疾', 'f23': '父亲(母亲)去世', 'f24': '突发重大自然灾害',
            'f25': '助学金个数', 'f26': '助学金金额', 'f27': '国助类型', 'f28': '民族',
            'f29': '家庭人口', 'f30': '是否贷款', 'f31': '入学前户口性质'
        }
        d.label = pd.Series([2] * n)
        d.strong_label = pd.Series([-1] * n)

        f = pd.DataFrame()
        # Aid-related features are always zero for non-difficult students.
        for i in ['f0', 'f1', 'f2', 'f3', 'f8', 'f25', 'f26', 'f27', 'f30']:
            f[i] = pd.Series(zeros(n, dtype='int32'))
        f['f4'] = pd.Series(random.binomial(1, 0.002, n))
        f['f10'] = pd.Series(random.binomial(1, 0.002, n))
        f['f11'] = pd.Series(random.binomial(1, 0.02, n))
        f['f12'] = pd.Series(random.lognormal(10.1811, 0.1892, n))
        f['f13'] = pd.Series(random.binomial(3, 0.01, n))
        f['f14'] = pd.Series(random.binomial(3, 0.01, n))
        f['f15'] = pd.Series(random.binomial(3, 0.035, n))
        f['f16'] = pd.Series(random.binomial(1, 0.15, n))
        f['f17'] = pd.Series(random.binomial(1, 0.01, n))
        f['f18'] = pd.Series(random.binomial(1, 0.01, n))
        f['f19'] = pd.Series(random.binomial(1, 0.05, n))
        f['f20'] = pd.Series(random.binomial(1, 0.003, n))
        f['f21'] = pd.Series(random.binomial(1, 0.008, n))
        f['f22'] = pd.Series(random.binomial(1, 0.00036, n))
        f['f23'] = pd.Series(random.binomial(1, 0.008, n))
        f['f24'] = pd.Series(random.binomial(1, 0.01, n))
        f['f28'] = pd.Series(random.binomial(1, 0.05, n))
        f['f29'] = pd.Series(random.binomial(7, 0.4, n))
        f['f31'] = pd.Series(random.binomial(1, 0.1, n))
        # Income sources are the bits of a random non-zero mask; the mask
        # reaches bit 3 (f6) only when f31 is set. randint excludes its upper
        # bound, so 16/8 reproduce the inclusive 15/7 of the deprecated
        # random_integers.
        f['source'] = f['f31'].apply(
            lambda x: random.randint(1, 16 if x else 8))
        f['f5'] = f['source'].apply(lambda x: x & 1)
        f['f7'] = f['source'].apply(lambda x: (x >> 1) & 1)
        f['f9'] = f['source'].apply(lambda x: (x >> 2) & 1)
        f['f6'] = f['source'].apply(lambda x: (x >> 3) & 1)
        f.drop('source', axis=1, inplace=True)
        d.features = f
        return d