文章
手机号、固话数据清洗
Python 实现
import re
# Dialling prefixes recognised by the cleaner, written with their leading zero(s).
qh_list = [
    "00852", "00853", "010", "020", "021", "022", "023", "024", "025", "027", "028", "029", "0310", "0311",
    "0312", "0313", "0314", "0315", "0316", "0317", "0318", "0319", "0335", "0349", "0350", "0351", "0352",
    "0353", "0354", "0355", "0356", "0357", "0358", "0359", "0370", "0371", "0372", "0373", "0374", "0375",
    "0376", "0377", "0379", "0391", "0392", "0393", "0394", "0395", "0396", "0398", "0411", "0412", "0415",
    "0416", "0417", "0418", "0419", "0421", "0427", "0429", "0431", "0432", "0433", "0434", "0435", "0436",
    "0437", "0438", "0439", "0451", "0452", "0453", "0454", "0455", "0456", "0457", "0458", "0459", "0464",
    "0467", "0468", "0469", "0470", "0471", "0472", "0473", "0474", "0475", "0476", "0477", "0478", "0479",
    "0482", "0483", "0510", "0511", "0512", "0513", "0514", "0515", "0516", "0517", "0518", "0519", "0523",
    "0527", "0530", "0531", "0532", "0533", "0534", "0535", "0536", "0537", "0538", "0539", "0543", "0546",
    "0550", "0551", "0552", "0553", "0554", "0555", "0556", "0557", "0558", "0559", "0561", "0562", "0563",
    "0564", "0566", "0570", "0571", "0572", "0573", "0574", "0575", "0576", "0577", "0578", "0579", "0580",
    "0591", "0592", "0593", "0594", "0595", "0596", "0597", "0598", "0599", "0631", "0632", "0633", "0635",
    "0660", "0662", "0663", "0668", "0691", "0692", "0701", "0710", "0711", "0712", "0713", "0714", "0715",
    "0716", "0717", "0718", "0719", "0722", "0724", "0730", "0731", "0734", "0735", "0736", "0737", "0738",
    "0739", "0743", "0744", "0745", "0746", "0750", "0751", "0752", "0753", "0754", "0755", "0756", "0757",
    "0758", "0759", "0760", "0762", "0763", "0766", "0768", "0769", "0770", "0771", "0772", "0773", "0774",
    "0775", "0776", "0777", "0778", "0779", "0790", "0791", "0792", "0793", "0794", "0795", "0796", "0797",
    "0798", "0799", "0812", "0813", "0816", "0817", "0818", "0825", "0826", "0827", "0830", "0831", "0832",
    "0833", "0834", "0835", "0836", "0837", "0838", "0839", "0851", "0854", "0855", "0856", "0857", "0858",
    "0859", "0870", "0871", "0872", "0873", "0874", "0875", "0876", "0877", "0878", "0879", "0883", "0886",
    "0887", "0888", "0891", "0892", "0893", "0894", "0895", "0896", "0897", "0898", "0901", "0902", "0903",
    "0906", "0908", "0909", "0911", "0912", "0913", "0914", "0915", "0916", "0917", "0919", "0930", "0931",
    "0932", "0933", "0934", "0935", "0936", "0937", "0938", "0939", "0941", "0943", "0951", "0952", "0953",
    "0954", "0955", "0970", "0971", "0972", "0973", "0974", "0975", "0976", "0977", "0990", "0991", "0994",
    "0995", "0996", "0997", "0998", "0999",
]

# The same prefixes with every leading zero removed, in the same order as qh_list.
qh_with_out_0_list = [code.lstrip("0") for code in qh_list]

# Identity lookup: prefix as written -> canonical prefix.
qh_dict = {code: code for code in qh_list}

# Zero-less prefix -> canonical prefix, so a lookup restores the dropped zero(s).
qh_with_out_0_dict = {
    short: full for short, full in zip(qh_with_out_0_list, qh_list)
}
# Sample inputs used while developing the cleaner; swap the active assignment
# below for any of these to try a different shape of raw data.
# contact_info = " -0632 7752028-211、 "
# contact_info = " -06327752028-211、 "
# contact_info = " -7752028-211、 "
# contact_info = " -0632-7752028、 "
# contact_info = " -0632 7752028、 "
# contact_info = " -7752028 7735679、 "
# contact_info = " -6327752028 7735679、 "
# contact_info = " -632 7752028 7735679、 "
# contact_info = " -632-7752028 7735679、 "
# contact_info = " -06327752028、 "
# contact_info = " -632 7752028、 "
# contact_info = " -6327752028、 "
# contact_info = " -18366688607、 "
# contact_info = " -4000000123、 "
# contact_info = " -400-0000-123、 "
# contact_info = " -400 0000 123、 "
# contact_info = " -4000-000-123、 "
# contact_info = " -10 00011123、 "
# contact_info = " +86.05311234567、 "
# contact_info = " +86 010-000111-123 "
# contact_info = " 86-539-6096806 6261846 "
# contact_info = " 86-535-6380599-8057、 "
contact_info = " 86-0633-15265889042、 "
# contact_info = "86-0632-4415911不方便接"
# contact_info = "86-0631-5583367 0631-5583367"
# contact_info = "7654321@163.com"
# contact_info = "86-0536--6763199 解晓强 (总经理)"
print(contact_info)
# Trim surrounding whitespace and map dash look-alikes to the ASCII hyphen.
# The original call replaced "-" with "-" and therefore did nothing; presumably
# the first literal was a full-width/CJK dash lost in transcription.
contact_info = contact_info.strip().replace("－", "-").replace("—", "-")
# Drop the whole run of non-digit characters at either end. Without the "+"
# only a single character per end was removed, so trailing notes of two or
# more characters survived and made the parsers reject the value.
contact_info = re.sub(r"^\D+|\D+$", "", contact_info)
# Turn each run of inner whitespace into a single "-" separator.
contact_info = re.sub(r"\s+", "-", contact_info)
# 固话的几种格式
def parse_tel_info(contact_info):
    """Split a landline string into area code, number and extension.

    The result always has the keys ``"qh"`` (area code), ``"tel"`` (number)
    and ``"fj"`` (extension); a part that could not be extracted stays ``None``.

    * Input containing anything other than ASCII digits, ``-`` and spaces is
      rejected outright.
    * All-digit input is delegated to ``parse_qh_tel``.
    * Otherwise the ``[area code][-]number[-extension]`` pattern is searched
      for; when it is absent and the input is at least 15 characters long,
      the first two dash-separated pieces are glued together and handed to
      ``parse_qh_tel``.
    """
    parsed = {"qh": None, "tel": None, "fj": None}

    # Reject anything that is not made of digits, dashes and spaces.
    if any(ch not in "0123456789- " for ch in contact_info):
        return parsed

    # No separators at all: area code and number are split by table lookup.
    if contact_info.isdigit():
        parsed.update(parse_qh_tel(contact_info))
        return parsed

    # Pattern: optional area code, 7-8 digit number, optional extension.
    found = re.search(r"((?P<qh>\d{2,4})-?)?(?P<tel>\d{7,8})(-(?P<fj>\d+))?", contact_info)

    if found is None:
        if len(contact_info) >= 15:
            pieces = contact_info.split("-")
            candidate = pieces[0]
            # A short first piece is treated as an area code; append the next
            # piece only when it has the length of a landline number.
            if len(candidate) <= 4 and 7 <= len(pieces[1]) <= 8:
                candidate += pieces[1]
            if candidate.isdigit():
                parsed.update(parse_qh_tel(candidate))
        return parsed

    number = found.group("tel")
    # Landline numbers never begin with 0 or 1; discard the whole match.
    if number[0] in ("0", "1"):
        return parsed
    parsed["tel"] = number

    area = found.group("qh")
    if area is not None:
        # Prefixes written without the leading zero are mapped back to the
        # canonical, zero-prefixed form.
        table = qh_dict if area.startswith("0") else qh_with_out_0_dict
        canonical = table.get(area)
        if canonical is not None:
            parsed["qh"] = canonical

    extension = found.group("fj")
    # Extensions are 1-4 digits long and must not be an area code themselves.
    if extension is not None and 1 <= len(extension) <= 4 and extension not in qh_dict:
        parsed["fj"] = extension
    return parsed
def parse_qh_tel(contact_info):
    """Split an all-digit string into area code and landline number.

    Args:
        contact_info: string of digits without separators.

    Returns:
        A dict that may contain ``"qh"`` (canonical, zero-prefixed area code)
        and ``"tel"`` (the remaining digits). For 7-8 digit input only
        ``"tel"`` is set. For 9-12 digit input both keys are set when the
        leading digits are a known area code; otherwise the dict is empty.
    """
    items = {}
    length = len(contact_info)

    # A bare 7-8 digit number has no area code.
    if length in (7, 8):
        items["tel"] = contact_info
        return items

    # 7-8 digit number plus a 2-4 digit area code (with or without its zero).
    if not 9 <= length <= 12:
        return items

    if contact_info.startswith("0"):
        table, prefix_lengths = qh_dict, (3, 4)
    else:
        # The table's values restore the zero(s) that the input omitted.
        table, prefix_lengths = qh_with_out_0_dict, (2, 3)

    for prefix_length in prefix_lengths:
        qh = table.get(contact_info[:prefix_length])
        if qh is not None:
            items["qh"] = qh
            # Slice by the number of characters actually matched in the input.
            # Deriving the offset from len(qh) - 1 assumed exactly one restored
            # zero and dropped a digit of the number for "00852" / "00853".
            items["tel"] = contact_info[prefix_length:]
            break
    return items
def parse_400tel_info(contact_info):
    """Normalise a 400-prefixed service number by removing its dashes.

    Args:
        contact_info: raw number, possibly containing ``-`` separators.

    Returns:
        A dict with keys ``"qh"``, ``"tel"`` and ``"fj"``. ``"tel"`` holds the
        dash-free number when it consists of exactly ten ASCII digits; in
        every other case all three values are ``None``.
    """
    items = {"qh": None, "tel": None, "fj": None}
    digits = contact_info.replace("-", "")
    # A valid number is exactly ten digits. Checking the characters as well as
    # the length keeps values such as "400-abc-defg" from being accepted,
    # in line with the character validation done for ordinary landlines.
    if re.fullmatch(r"[0-9]{10}", digits):
        items["tel"] = digits
    return items
# Numbers starting with 400 use the dedicated parser; everything else is
# treated as an ordinary landline.
res = (
    parse_400tel_info(contact_info)
    if contact_info.startswith("400")
    else parse_tel_info(contact_info)
)
print(res)
# Join the parts that were found (area code, number, extension) with "-".
print("-".join(filter(None, res.values())))
Java 实现
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Cleans raw contact strings. The contact type selects the rule set:
 * "1" mobile number, "2" landline (including 400 service numbers),
 * "3" e-mail address, "4" WeChat id. Any other type returns the trimmed input.
 */
public class CleanContactInfo {
    /** Dialling prefixes recognised by the cleaner, written with their leading zero(s). */
    static String[] qh_list = new String[]{
            "00852", "00853", "010", "020", "021", "022", "023", "024", "025", "027", "028", "029", "0310", "0311",
            "0312", "0313", "0314", "0315", "0316", "0317", "0318", "0319", "0335", "0349", "0350", "0351", "0352",
            "0353", "0354", "0355", "0356", "0357", "0358", "0359", "0370", "0371", "0372", "0373", "0374", "0375",
            "0376", "0377", "0379", "0391", "0392", "0393", "0394", "0395", "0396", "0398", "0411", "0412", "0415",
            "0416", "0417", "0418", "0419", "0421", "0427", "0429", "0431", "0432", "0433", "0434", "0435", "0436",
            "0437", "0438", "0439", "0451", "0452", "0453", "0454", "0455", "0456", "0457", "0458", "0459", "0464",
            "0467", "0468", "0469", "0470", "0471", "0472", "0473", "0474", "0475", "0476", "0477", "0478", "0479",
            "0482", "0483", "0510", "0511", "0512", "0513", "0514", "0515", "0516", "0517", "0518", "0519", "0523",
            "0527", "0530", "0531", "0532", "0533", "0534", "0535", "0536", "0537", "0538", "0539", "0543", "0546",
            "0550", "0551", "0552", "0553", "0554", "0555", "0556", "0557", "0558", "0559", "0561", "0562", "0563",
            "0564", "0566", "0570", "0571", "0572", "0573", "0574", "0575", "0576", "0577", "0578", "0579", "0580",
            "0591", "0592", "0593", "0594", "0595", "0596", "0597", "0598", "0599", "0631", "0632", "0633", "0635",
            "0660", "0662", "0663", "0668", "0691", "0692", "0701", "0710", "0711", "0712", "0713", "0714", "0715",
            "0716", "0717", "0718", "0719", "0722", "0724", "0730", "0731", "0734", "0735", "0736", "0737", "0738",
            "0739", "0743", "0744", "0745", "0746", "0750", "0751", "0752", "0753", "0754", "0755", "0756", "0757",
            "0758", "0759", "0760", "0762", "0763", "0766", "0768", "0769", "0770", "0771", "0772", "0773", "0774",
            "0775", "0776", "0777", "0778", "0779", "0790", "0791", "0792", "0793", "0794", "0795", "0796", "0797",
            "0798", "0799", "0812", "0813", "0816", "0817", "0818", "0825", "0826", "0827", "0830", "0831", "0832",
            "0833", "0834", "0835", "0836", "0837", "0838", "0839", "0851", "0854", "0855", "0856", "0857", "0858",
            "0859", "0870", "0871", "0872", "0873", "0874", "0875", "0876", "0877", "0878", "0879", "0883", "0886",
            "0887", "0888", "0891", "0892", "0893", "0894", "0895", "0896", "0897", "0898", "0901", "0902", "0903",
            "0906", "0908", "0909", "0911", "0912", "0913", "0914", "0915", "0916", "0917", "0919", "0930", "0931",
            "0932", "0933", "0934", "0935", "0936", "0937", "0938", "0939", "0941", "0943", "0951", "0952", "0953",
            "0954", "0955", "0970", "0971", "0972", "0973", "0974", "0975", "0976", "0977", "0990", "0991", "0994",
            "0995", "0996", "0997", "0998", "0999"
    };

    /** Prefix as written -> canonical prefix. */
    static HashMap<String, String> areaCodeMap = new HashMap<>();

    /** Prefix without its leading zero(s) -> canonical prefix. */
    static HashMap<String, String> areaCodeWithOut0Map = new HashMap<>();

    static {
        for (String qh : qh_list) {
            String qhWithOut0 = qh.replaceFirst("^0*", "");
            areaCodeMap.put(qh, qh);
            areaCodeWithOut0Map.put(qhWithOut0, qh);
        }
    }

    static List<String> contactTypes = Arrays.asList("1", "2", "3", "4");

    /** Optional area code, 7-8 digit number, optional extension; compiled once. */
    private static final Pattern TEL_PATTERN =
            Pattern.compile("((?<qh>\\d{2,4})-?)?(?<tel>\\d{7,8})(-(?<fj>\\d+))?");

    /**
     * Cleans one contact value according to its type.
     *
     * @param contactType "1" mobile, "2" landline, "3" e-mail, "4" WeChat; any other
     *                    value (including null) returns the trimmed input unchanged
     * @param contactInfo raw value; null or blank yields ""
     * @return the cleaned value, or "" when nothing valid could be extracted
     */
    public static String clean(String contactType, String contactInfo) {
        if (contactInfo == null || contactInfo.trim().isEmpty()) {
            return "";
        }
        // Trim and map dash look-alikes to the ASCII hyphen. The original call replaced
        // "-" with "-" and did nothing; presumably the first literal was a full-width/CJK
        // dash lost in transcription.
        contactInfo = contactInfo.trim().replace('－', '-').replace('—', '-');
        // Drop leading/trailing characters that are neither letters nor digits.
        contactInfo = contactInfo.replaceAll("(^[^A-Za-z0-9]+)|([^A-Za-z0-9]+$)", "");
        if (contactType == null) {
            // switch on a null String would throw; treat it like an unknown type.
            return contactInfo;
        }
        switch (contactType) {
            case "1":
                return processContactType1(contactInfo);
            case "2":
                return processContactType2(contactInfo);
            case "3":
                return processContactType3(contactInfo);
            case "4":
                return processContactType4(contactInfo);
            default:
                return contactInfo;
        }
    }

    /** WeChat ids are passed through unchanged. */
    public static String processContactType4(String contactInfo) {
        return contactInfo;
    }

    /** Returns the first e-mail address found in the input, or "" when there is none. */
    public static String processContactType3(String contactInfo) {
        String regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}";
        return getRegexResult(regex, contactInfo);
    }

    /**
     * Cleans a landline or 400 service number.
     *
     * @return the found parts (area code, number, extension) joined with "-", or ""
     */
    public static String processContactType2(String contactInfo) {
        // Only digits, dashes and whitespace are acceptable. Whitespace has to be allowed
        // here, otherwise the whitespace-to-dash conversion below could never apply.
        String match = getRegexResult("^[\\d\\s-]+$", contactInfo);
        if ("".equals(match)) {
            return "";
        }
        // Drop non-digit characters at both ends.
        contactInfo = contactInfo.replaceAll("^\\D+|\\D+$", "");
        // Turn each run of inner whitespace into a single "-".
        contactInfo = contactInfo.replaceAll("\\s+", "-");

        HashMap<String, String> res = contactInfo.startsWith("400")
                ? parse400TelInfo(contactInfo)
                : parseTelInfo(contactInfo);

        ArrayList<String> parts = new ArrayList<>();
        for (String key : new String[]{"qh", "tel", "fj"}) {
            String value = res.get(key);
            if (value != null) {
                parts.add(value);
            }
        }
        return String.join("-", parts);
    }

    /** Returns the first mainland mobile number (1[3-9] followed by 9 digits), or "". */
    public static String processContactType1(String contactInfo) {
        // Only digits, '+' and whitespace are acceptable.
        String match = getRegexResult("^[\\d+\\s]+$", contactInfo);
        if ("".equals(match)) {
            return "";
        }
        // Remove inner whitespace before matching.
        contactInfo = contactInfo.replaceAll("\\s+", "");
        String regex = "1[3-9]\\d{9}";
        return getRegexResult(regex, contactInfo);
    }

    /**
     * Extracts area code, number and extension from a landline string.
     *
     * @param contactInfo landline text made of digits and dashes
     * @return map with keys "qh", "tel" and "fj"; a part that was not found stays null
     */
    public static HashMap<String, String> parseTelInfo(String contactInfo) {
        HashMap<String, String> items = new HashMap<>();
        items.put("qh", null);
        items.put("tel", null);
        items.put("fj", null);

        // No separators at all: area code and number are split by table lookup.
        if (isNumeric(contactInfo)) {
            items.putAll(parseQhTel(contactInfo));
            return items;
        }

        Matcher matcher = TEL_PATTERN.matcher(contactInfo);
        if (!matcher.find()) {
            if (contactInfo.length() >= 15) {
                String[] parts = contactInfo.split("-");
                // split() drops trailing empty strings, so the array can be shorter
                // than the number of dashes suggests; guard every index.
                if (parts.length > 0) {
                    String part = parts[0];
                    // A short first piece is treated as an area code; append the next
                    // piece only when it has the length of a landline number.
                    if (part.length() <= 4 && parts.length > 1
                            && parts[1].length() >= 7 && parts[1].length() <= 8) {
                        part += parts[1];
                    }
                    if (isNumeric(part)) {
                        items.putAll(parseQhTel(part));
                    }
                }
            }
            return items;
        }

        String tel = matcher.group("tel");
        // Landline numbers never begin with 0 or 1; discard the whole match.
        char first = tel.charAt(0);
        if (first == '0' || first == '1') {
            return items;
        }
        items.put("tel", tel);

        String qh = matcher.group("qh");
        if (qh != null) {
            // Prefixes written without the leading zero are mapped to the canonical form.
            String canonical = qh.startsWith("0") ? areaCodeMap.get(qh) : areaCodeWithOut0Map.get(qh);
            if (canonical != null) {
                items.put("qh", canonical);
            }
        }

        String fj = matcher.group("fj");
        // Extensions are 1-4 digits long and must not be an area code themselves.
        if (fj != null && fj.length() >= 1 && fj.length() <= 4 && !areaCodeMap.containsKey(fj)) {
            items.put("fj", fj);
        }
        return items;
    }

    /**
     * Tells whether the string is non-empty and consists of digits only.
     *
     * @param str value to test; null yields false
     * @return true when every character is a digit
     */
    public static boolean isNumeric(String str) {
        if (str == null || str.length() == 0) {
            return false;
        }
        return str.matches("\\d+");
    }

    /**
     * Splits an all-digit string into area code and landline number.
     * The area code must be one of the known prefixes.
     *
     * @param contactInfo digits without separators
     * @return map that may contain "qh" and "tel"; only "tel" for 7-8 digit input;
     *         empty when a 9-12 digit input does not start with a known prefix
     */
    public static HashMap<String, String> parseQhTel(String contactInfo) {
        HashMap<String, String> items = new HashMap<>();
        int length = contactInfo.length();

        // A bare 7-8 digit number has no area code.
        if (length >= 7 && length <= 8) {
            items.put("tel", contactInfo);
            return items;
        }
        // 7-8 digit number plus a 2-4 digit area code (with or without its zero).
        if (length < 9 || length > 12) {
            return items;
        }

        boolean startsWithZero = contactInfo.startsWith("0");
        HashMap<String, String> table = startsWithZero ? areaCodeMap : areaCodeWithOut0Map;
        int[] prefixLengths = startsWithZero ? new int[]{3, 4} : new int[]{2, 3};

        for (int prefixLength : prefixLengths) {
            String qh = table.get(contactInfo.substring(0, prefixLength));
            if (qh != null) {
                items.put("qh", qh);
                // Cut by the number of characters actually matched in the input.
                // Deriving the offset from qh.length() - 1 assumed exactly one restored
                // zero and dropped a digit of the number for "00852" / "00853".
                items.put("tel", contactInfo.substring(prefixLength));
                break;
            }
        }
        return items;
    }

    /**
     * Normalises a 400 service number by removing its dashes.
     *
     * @param contactInfo raw number, possibly containing "-"
     * @return map containing "tel" when the dash-free value is exactly ten digits,
     *         otherwise an empty map
     */
    public static HashMap<String, String> parse400TelInfo(String contactInfo) {
        HashMap<String, String> items = new HashMap<>();
        String digits = contactInfo.replace("-", "");
        // Ten characters that are all digits; length alone would accept letters.
        if (digits.matches("\\d{10}")) {
            items.put("tel", digits);
        }
        return items;
    }

    /**
     * Returns the first match of the regular expression in the content.
     *
     * @param regex   pattern to search for
     * @param content text to search in
     * @return the matched text, or "" when there is no match
     */
    public static String getRegexResult(String regex, String content) {
        Matcher matcher = Pattern.compile(regex).matcher(content);
        return matcher.find() ? matcher.group() : "";
    }
}