url 패턴 제거

    [NLP]. 텍스트 데이터 정제(이모지 , 특수문자, url , 한자 제거)

    import re import emoji from soynlp.normalizer import repeat_normalize pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-ㅣ가-힣]+') url_pattern = re.compile( r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)') def clean(x): x = pattern.sub(' ', x) x = emoji.replace_emoji(x, replace='') #emoji 삭제 x = url_pattern.sub('', x) x = x.strip() x = repe..