大数据

手机号 固话数据清洗

python实现

import re

# Area codes accepted by the parsers below, written with their leading zero(s).
qh_list = ['00852', '00853', '010', '020', '021', '022', '023', '024', '025', '027', '028', '029', '0310', '0311',
           '0312', '0313', '0314', '0315', '0316', '0317', '0318', '0319', '0335', '0349', '0350', '0351', '0352',
           '0353', '0354', '0355', '0356', '0357', '0358', '0359', '0370', '0371', '0372', '0373', '0374', '0375',
           '0376', '0377', '0379', '0391', '0392', '0393', '0394', '0395', '0396', '0398', '0411', '0412', '0415',
           '0416', '0417', '0418', '0419', '0421', '0427', '0429', '0431', '0432', '0433', '0434', '0435', '0436',
           '0437', '0438', '0439', '0451', '0452', '0453', '0454', '0455', '0456', '0457', '0458', '0459', '0464',
           '0467', '0468', '0469', '0470', '0471', '0472', '0473', '0474', '0475', '0476', '0477', '0478', '0479',
           '0482', '0483', '0510', '0511', '0512', '0513', '0514', '0515', '0516', '0517', '0518', '0519', '0523',
           '0527', '0530', '0531', '0532', '0533', '0534', '0535', '0536', '0537', '0538', '0539', '0543', '0546',
           '0550', '0551', '0552', '0553', '0554', '0555', '0556', '0557', '0558', '0559', '0561', '0562', '0563',
           '0564', '0566', '0570', '0571', '0572', '0573', '0574', '0575', '0576', '0577', '0578', '0579', '0580',
           '0591', '0592', '0593', '0594', '0595', '0596', '0597', '0598', '0599', '0631', '0632', '0633', '0635',
           '0660', '0662', '0663', '0668', '0691', '0692', '0701', '0710', '0711', '0712', '0713', '0714', '0715',
           '0716', '0717', '0718', '0719', '0722', '0724', '0730', '0731', '0734', '0735', '0736', '0737', '0738',
           '0739', '0743', '0744', '0745', '0746', '0750', '0751', '0752', '0753', '0754', '0755', '0756', '0757',
           '0758', '0759', '0760', '0762', '0763', '0766', '0768', '0769', '0770', '0771', '0772', '0773', '0774',
           '0775', '0776', '0777', '0778', '0779', '0790', '0791', '0792', '0793', '0794', '0795', '0796', '0797',
           '0798', '0799', '0812', '0813', '0816', '0817', '0818', '0825', '0826', '0827', '0830', '0831', '0832',
           '0833', '0834', '0835', '0836', '0837', '0838', '0839', '0851', '0854', '0855', '0856', '0857', '0858',
           '0859', '0870', '0871', '0872', '0873', '0874', '0875', '0876', '0877', '0878', '0879', '0883', '0886',
           '0887', '0888', '0891', '0892', '0893', '0894', '0895', '0896', '0897', '0898', '0901', '0902', '0903',
           '0906', '0908', '0909', '0911', '0912', '0913', '0914', '0915', '0916', '0917', '0919', '0930', '0931',
           '0932', '0933', '0934', '0935', '0936', '0937', '0938', '0939', '0941', '0943', '0951', '0952', '0953',
           '0954', '0955', '0970', '0971', '0972', '0973', '0974', '0975', '0976', '0977', '0990', '0991', '0994',
           '0995', '0996', '0997', '0998', '0999']

# The same codes with every leading zero removed, in the same order as qh_list.
qh_with_out_0_list = [code.lstrip("0") for code in qh_list]

# Identity lookup: zero-prefixed code -> itself (used as a membership table).
qh_dict = {code: code for code in qh_list}
# Reverse lookup: zero-stripped code -> the zero-prefixed code it came from.
qh_with_out_0_dict = {
    stripped: code for stripped, code in zip(qh_with_out_0_list, qh_list)
}

# contact_info = " -0632 7752028-211、 "
# contact_info = " -06327752028-211、 "
# contact_info = " -7752028-211、 "
# contact_info = " -0632-7752028、 "
# contact_info = " -0632 7752028、 "
# contact_info = " -7752028 7735679、 "
# contact_info = " -6327752028 7735679、 "
# contact_info = " -632 7752028 7735679、 "
# contact_info = " -632-7752028 7735679、 "
# contact_info = " -06327752028、 "
# contact_info = " -632 7752028、 "
# contact_info = " -6327752028、 "
# contact_info = " -18366688607、 "
# contact_info = " -4000000123、 "
# contact_info = " -400-0000-123、 "
# contact_info = " -400 0000 123、 "
# contact_info = " -4000-000-123、 "
# contact_info = " -10 00011123、 "
# contact_info = " +86.05311234567、 "
# contact_info = " +86 010-000111-123 "
# contact_info = " 86-539-6096806 6261846 "
# contact_info = " 86-535-6380599-8057、 "
contact_info = " 86-0633-15265889042、 "
# contact_info = "86-0632-4415911不方便接"
# contact_info = "86-0631-5583367 0631-5583367"
# contact_info = "7654321@163.com"
# contact_info = "86-0536--6763199 解晓强 (总经理)"


def normalize_contact_info(raw):
    """Return *raw* reduced to a digit-delimited string ready for parsing.

    Steps, in order:
      1. trim surrounding whitespace;
      2. map non-ASCII dash variants (U+FF0D, U+2013, U+2014) to ASCII "-";
      3. drop every leading and trailing non-digit character;
      4. collapse each remaining whitespace run into a single "-".
    """
    # The previous replace("-", "-") was a no-op, so full-width dashes
    # survived and made the parsers reject the whole string.
    dash_table = str.maketrans({"\uff0d": "-", "\u2013": "-", "\u2014": "-"})
    cleaned = raw.strip().translate(dash_table)
    # "+" quantifiers: the previous pattern removed only ONE character per end,
    # so trailing notes such as "...4415911不方便接" were left in place.
    cleaned = re.sub(r"^\D+|\D+$", "", cleaned)
    return re.sub(r"\s+", "-", cleaned)


print(contact_info)
contact_info = normalize_contact_info(contact_info)


# 固话的几种格式
def parse_tel_info(contact_info):
    """Split a landline string into area code, number and extension.

    Returns a dict with keys "qh" (area code), "tel" (number) and "fj"
    (extension); each value is a string or None when that part was not
    recognised. Input containing anything other than ASCII digits, "-" or
    a space yields all-None.
    """
    result = dict.fromkeys(("qh", "tel", "fj"))

    permitted = set("0123456789- ")
    if any(ch not in permitted for ch in contact_info):
        return result

    if contact_info.isdigit():
        # Pure digit string: let parse_qh_tel split area code from number.
        result.update(parse_qh_tel(contact_info))
        return result

    # Shape: [area code][-]number[-extension]
    matched = re.search(r"((?P<qh>\d{2,4})-?)?(?P<tel>\d{7,8})(-(?P<fj>\d+))?", contact_info)
    if matched is None:
        # NOTE(review): with no run of 7+ digits anywhere in the string this
        # fallback does not appear able to yield a number - confirm intent.
        if len(contact_info) >= 15:
            segments = contact_info.split("-")
            candidate = segments[0]
            # Leading segment short enough to be an area code: glue on the
            # next segment when it has the length of a landline number.
            if len(candidate) <= 4 and len(segments[1]) in (7, 8):
                candidate += segments[1]
            if candidate.isdigit():
                result.update(parse_qh_tel(candidate))
        return result

    area, number, extension = matched.group("qh", "tel", "fj")
    # Landline numbers never start with 0 or 1.
    if number is None or number[0] in ("0", "1"):
        return result

    result["tel"] = number
    if area is not None:
        # Codes written without the leading zero are mapped back to the
        # zero-prefixed form; unknown codes are dropped.
        table = qh_dict if area.startswith("0") else qh_with_out_0_dict
        area = table.get(area)
        if area is not None:
            result["qh"] = area
    # An extension is 1-4 digits and must not itself be an area code.
    if extension is not None and 1 <= len(extension) <= 4 and extension not in qh_dict.keys():
        result["fj"] = extension
    return result


def parse_qh_tel(contact_info):
    """Split an all-digit string into area code ("qh") and number ("tel").

    * 9-12 characters: the leading characters are looked up as an area code.
      A string starting with "0" is probed with its first 3, then 4,
      characters in ``qh_dict``; otherwise its first 2, then 3, characters
      are probed in ``qh_with_out_0_dict``, which yields the zero-prefixed
      code. On a hit the dict holds "qh" and "tel"; on a miss it is empty.
    * 7-8 characters: treated as a bare number, only "tel" is set.
    * any other length: an empty dict.
    """
    items = {}
    if not 9 <= len(contact_info) <= 12:
        # No room for an area code: accept only a bare 7-8 character number.
        if len(contact_info) in (7, 8):
            items["tel"] = contact_info
        return items

    if contact_info.startswith("0"):
        table, prefix_lengths = qh_dict, (3, 4)
    else:
        table, prefix_lengths = qh_with_out_0_dict, (2, 3)

    for prefix_len in prefix_lengths:
        qh = table.get(contact_info[:prefix_len])
        if qh is not None:
            items["qh"] = qh
            # Slice by the number of characters actually matched. The old
            # "len(qh) - 1" assumed exactly one restored zero and chopped a
            # digit off the number for codes restored with two ("00852").
            items["tel"] = contact_info[prefix_len:]
            break
    return items


def parse_400tel_info(contact_info):
    """Parse a 400 service number, ignoring "-" separators.

    Returns a dict with keys "qh", "tel" and "fj". "tel" is the number with
    dashes removed when it is exactly ten ASCII digits; otherwise all three
    values are None. "qh" and "fj" are always None.
    """
    items = {
        "qh": None,
        "tel": None,
        "fj": None
    }
    digits = contact_info.replace("-", "")
    # Require ten digits, not merely ten characters: a length-only check
    # accepted strings such as "400abc1234".
    if re.fullmatch(r"[0-9]{10}", digits):
        items["tel"] = digits
    return items


# Numbers beginning with "400" go to the 400-line parser; everything else is
# handled as an ordinary landline.
res = (parse_400tel_info if contact_info.startswith("400") else parse_tel_info)(contact_info)
print(res)
# Emit the recognised parts joined by "-", skipping the ones left empty.
print("-".join(part for part in res.values() if part))

java实现

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Cleans raw contact strings. The contact type selects the rule set:
 * "1" mobile number, "2" landline, "3" e-mail, "4" WeChat id (returned as-is).
 */
public class CleanContactInfo {
    /** Area codes accepted by the landline parser, written with their leading zero(s). */
    static String[] qh_list = new String[]{
            "00852", "00853", "010", "020", "021", "022", "023", "024", "025", "027", "028", "029", "0310", "0311",
            "0312", "0313", "0314", "0315", "0316", "0317", "0318", "0319", "0335", "0349", "0350", "0351", "0352",
            "0353", "0354", "0355", "0356", "0357", "0358", "0359", "0370", "0371", "0372", "0373", "0374", "0375",
            "0376", "0377", "0379", "0391", "0392", "0393", "0394", "0395", "0396", "0398", "0411", "0412", "0415",
            "0416", "0417", "0418", "0419", "0421", "0427", "0429", "0431", "0432", "0433", "0434", "0435", "0436",
            "0437", "0438", "0439", "0451", "0452", "0453", "0454", "0455", "0456", "0457", "0458", "0459", "0464",
            "0467", "0468", "0469", "0470", "0471", "0472", "0473", "0474", "0475", "0476", "0477", "0478", "0479",
            "0482", "0483", "0510", "0511", "0512", "0513", "0514", "0515", "0516", "0517", "0518", "0519", "0523",
            "0527", "0530", "0531", "0532", "0533", "0534", "0535", "0536", "0537", "0538", "0539", "0543", "0546",
            "0550", "0551", "0552", "0553", "0554", "0555", "0556", "0557", "0558", "0559", "0561", "0562", "0563",
            "0564", "0566", "0570", "0571", "0572", "0573", "0574", "0575", "0576", "0577", "0578", "0579", "0580",
            "0591", "0592", "0593", "0594", "0595", "0596", "0597", "0598", "0599", "0631", "0632", "0633", "0635",
            "0660", "0662", "0663", "0668", "0691", "0692", "0701", "0710", "0711", "0712", "0713", "0714", "0715",
            "0716", "0717", "0718", "0719", "0722", "0724", "0730", "0731", "0734", "0735", "0736", "0737", "0738",
            "0739", "0743", "0744", "0745", "0746", "0750", "0751", "0752", "0753", "0754", "0755", "0756", "0757",
            "0758", "0759", "0760", "0762", "0763", "0766", "0768", "0769", "0770", "0771", "0772", "0773", "0774",
            "0775", "0776", "0777", "0778", "0779", "0790", "0791", "0792", "0793", "0794", "0795", "0796", "0797",
            "0798", "0799", "0812", "0813", "0816", "0817", "0818", "0825", "0826", "0827", "0830", "0831", "0832",
            "0833", "0834", "0835", "0836", "0837", "0838", "0839", "0851", "0854", "0855", "0856", "0857", "0858",
            "0859", "0870", "0871", "0872", "0873", "0874", "0875", "0876", "0877", "0878", "0879", "0883", "0886",
            "0887", "0888", "0891", "0892", "0893", "0894", "0895", "0896", "0897", "0898", "0901", "0902", "0903",
            "0906", "0908", "0909", "0911", "0912", "0913", "0914", "0915", "0916", "0917", "0919", "0930", "0931",
            "0932", "0933", "0934", "0935", "0936", "0937", "0938", "0939", "0941", "0943", "0951", "0952", "0953",
            "0954", "0955", "0970", "0971", "0972", "0973", "0974", "0975", "0976", "0977", "0990", "0991", "0994",
            "0995", "0996", "0997", "0998", "0999"};
    /** Zero-prefixed area code mapped to itself (membership table). */
    static HashMap<String, String> areaCodeMap = new HashMap<>();
    /** Area code with leading zeros stripped, mapped to its zero-prefixed form. */
    static HashMap<String, String> areaCodeWithOut0Map = new HashMap<>();

    static {
        for (String qh : qh_list) {
            String qhWithOut0 = qh.replaceFirst("^0*", "");
            areaCodeMap.put(qh, qh);
            areaCodeWithOut0Map.put(qhWithOut0, qh);
        }
    }


    /** Contact type codes handled by {@link #clean}; not referenced inside this class. */
    static List<String> contactTypes = Arrays.asList("1", "2", "3", "4");

    /** Landline shape: [area code][-]number[-extension]; compiled once and reused. */
    private static final Pattern TEL_PATTERN =
            Pattern.compile("((?<qh>\\d{2,4})-?)?(?<tel>\\d{7,8})(-(?<fj>\\d+))?");

    /**
     * Normalises a raw contact string and applies the cleaner for its type.
     *
     * @param contactType "1" mobile, "2" landline, "3" e-mail, "4" WeChat; any other
     *                    value (including null) returns the normalised text unchanged
     * @param contactInfo raw contact text; null or blank yields ""
     * @return the cleaned value, or "" when nothing valid could be extracted
     */
    public static String clean(String contactType, String contactInfo) {
        // Null used to throw a NullPointerException here; treat it like blank input.
        if (contactInfo == null || contactInfo.trim().isEmpty()) {
            return "";
        }
        // Trim, then map non-ASCII dash variants (U+FF0D, U+2013, U+2014) to "-".
        // The previous replaceAll("-", "-") replaced a dash with itself.
        contactInfo = contactInfo.trim().replaceAll("[\\uFF0D\\u2013\\u2014]", "-");
        // Strip characters that are neither ASCII letters nor digits from both ends.
        contactInfo = contactInfo.replaceAll("(^[^A-Za-z0-9]+)|([^A-Za-z0-9]+$)", "");

        // switch on a null String throws, so handle it like an unknown type.
        if (contactType == null) {
            return contactInfo;
        }
        switch (contactType) {
            case "1":
                return processContactType1(contactInfo);
            case "2":
                return processContactType2(contactInfo);
            case "3":
                return processContactType3(contactInfo);
            case "4":
                return processContactType4(contactInfo);
            default:
                return contactInfo;
        }
    }

    /**
     * WeChat id: returned unchanged.
     *
     * @param contactInfo normalised contact text
     * @return {@code contactInfo} as given
     */
    public static String processContactType4(String contactInfo) {
        return contactInfo;
    }

    /**
     * E-mail: returns the first e-mail-shaped substring.
     *
     * @param contactInfo normalised contact text
     * @return the first match, or "" when there is none
     */
    public static String processContactType3(String contactInfo) {
        String regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}";
        return getRegexResult(regex, contactInfo);
    }

    /**
     * Landline: returns "areaCode-number-extension", omitting parts that were not
     * recognised.
     *
     * @param contactInfo normalised contact text
     * @return the joined parts, or "" when the text contains anything other than
     *         digits, whitespace and "-" or no number could be recognised
     */
    public static String processContactType2(String contactInfo) {
        // Whitespace has to be accepted here: it is turned into "-" just below.
        // The previous check (digits and "-" only) rejected every space-separated
        // number before that conversion could run.
        String match = getRegexResult("^[\\d\\s-]+$", contactInfo);
        if ("".equals(match)) {
            return "";
        }
        // Drop non-digit characters at both ends.
        contactInfo = contactInfo.replaceAll("^\\D+|\\D+$", "");
        // Turn inner whitespace runs into "-".
        contactInfo = contactInfo.replaceAll("\\s+", "-");

        HashMap<String, String> res;
        if (contactInfo.startsWith("400")) {
            res = parse400TelInfo(contactInfo);
        } else {
            res = parseTelInfo(contactInfo);
        }
        ArrayList<String> parts = new ArrayList<>();
        for (String key : new String[]{"qh", "tel", "fj"}) {
            String value = res.get(key);
            if (value != null) {
                parts.add(value);
            }
        }
        return String.join("-", parts);
    }

    /**
     * Mobile number: returns the first 11-digit run of the form 1[3-9]xxxxxxxxx.
     *
     * @param contactInfo normalised contact text
     * @return the mobile number, or "" when the text contains anything other than
     *         digits, "+" and whitespace or holds no such run
     */
    public static String processContactType1(String contactInfo) {
        String match = getRegexResult("^[\\d+\\s]+$", contactInfo);
        if ("".equals(match)) {
            return "";
        }
        // Remove inner whitespace before searching.
        contactInfo = contactInfo.replaceAll("\\s+", "");
        String regex = "1[3-9]\\d{9}";
        return getRegexResult(regex, contactInfo);
    }

    /**
     * Extracts area code ("qh"), number ("tel") and extension ("fj") from a landline string.
     *
     * @param contactInfo landline text, expected to hold only digits and "-"
     * @return a map containing all three keys; a value is null when that part was
     *         not recognised
     */
    public static HashMap<String, String> parseTelInfo(String contactInfo) {
        HashMap<String, String> items = new HashMap<>();
        items.put("qh", null);
        items.put("tel", null);
        items.put("fj", null);

        if (isNumeric(contactInfo)) {
            // Pure digit string: let parseQhTel split area code from number.
            items.putAll(parseQhTel(contactInfo));
            return items;
        }

        Matcher matcher = TEL_PATTERN.matcher(contactInfo);
        if (matcher.find()) {
            String qh = matcher.group("qh");
            String tel = matcher.group("tel");
            String fj = matcher.group("fj");
            // Landline numbers never start with 0 or 1.
            if (tel != null && !tel.startsWith("0") && !tel.startsWith("1")) {
                items.put("tel", tel);
                if (qh != null) {
                    // Codes written without the leading zero are mapped back to the
                    // zero-prefixed form; unknown codes are dropped.
                    qh = qh.startsWith("0") ? areaCodeMap.get(qh) : areaCodeWithOut0Map.get(qh);
                    if (qh != null) {
                        items.put("qh", qh);
                    }
                }
                // An extension is 1-4 digits and must not itself be an area code.
                if (fj != null && fj.length() >= 1 && fj.length() <= 4 && !areaCodeMap.containsKey(fj)) {
                    items.put("fj", fj);
                }
            }
        } else if (contactInfo.length() >= 15) {
            // NOTE(review): with no run of 7+ digits anywhere in the string this
            // fallback does not appear able to yield a number - confirm intent.
            String[] parts = contactInfo.split("-");
            String part = parts[0];
            // Leading segment short enough to be an area code: glue on the next
            // segment when it has the length of a landline number. split() drops
            // trailing empty strings, so a second segment is not guaranteed.
            if (part.length() <= 4 && parts.length > 1
                    && parts[1].length() >= 7 && parts[1].length() <= 8) {
                part += parts[1];
            }
            if (isNumeric(part)) {
                items.putAll(parseQhTel(part));
            }
        }
        return items;
    }

    /**
     * Tells whether a string consists of digits only.
     *
     * @param str text to test, may be null
     * @return true when {@code str} is non-empty and every character is a digit
     */
    public static boolean isNumeric(String str) {
        if (str == null || str.length() == 0) {
            return false;
        }
        return str.matches("\\d+");
    }

    /**
     * Splits an all-digit string into area code ("qh") and number ("tel").
     * With 9-12 characters the leading characters are looked up as an area code
     * (3 then 4 characters when the string starts with "0", otherwise 2 then 3
     * characters of the zero-stripped form). With 7-8 characters the whole string
     * is taken as the number.
     *
     * @param contactInfo digit string
     * @return a map with "qh" and "tel", with "tel" only, or empty when nothing matched
     */
    public static HashMap<String, String> parseQhTel(String contactInfo) {
        HashMap<String, String> items = new HashMap<>();
        int length = contactInfo.length();
        if (length < 9 || length > 12) {
            // No room for an area code: accept only a bare 7-8 character number.
            if (length >= 7 && length <= 8) {
                items.put("tel", contactInfo);
            }
            return items;
        }

        boolean zeroPrefixed = contactInfo.startsWith("0");
        HashMap<String, String> table = zeroPrefixed ? areaCodeMap : areaCodeWithOut0Map;
        int shortestPrefix = zeroPrefixed ? 3 : 2;
        for (int prefixLength = shortestPrefix; prefixLength <= shortestPrefix + 1; prefixLength++) {
            String qh = table.get(contactInfo.substring(0, prefixLength));
            if (qh != null) {
                items.put("qh", qh);
                // Slice by the number of characters actually matched. The old
                // "qh.length() - 1" assumed exactly one restored zero and chopped a
                // digit off the number for codes restored with two ("00852").
                items.put("tel", contactInfo.substring(prefixLength));
                break;
            }
        }
        return items;
    }

    /**
     * Parses a 400 service number, ignoring "-" separators.
     *
     * @param contactInfo text starting with "400"
     * @return a map whose "tel" is the dash-free text when it is exactly 10
     *         characters long; otherwise an empty map
     */
    public static HashMap<String, String> parse400TelInfo(String contactInfo) {
        HashMap<String, String> items = new HashMap<>();
        contactInfo = contactInfo.replaceAll("\\-", "");
        if (contactInfo.length() == 10) {
            items.put("tel", contactInfo);
        }
        return items;
    }

    /**
     * Returns the first substring of {@code content} matching {@code regex}.
     *
     * @param regex   regular expression to search for
     * @param content text to search
     * @return the first match, or "" when there is none
     */
    public static String getRegexResult(String regex, String content) {
        Matcher matcher = Pattern.compile(regex).matcher(content);
        return matcher.find() ? matcher.group() : "";
    }
}