1. 程式人生 > Python string 去掉標點符號 最佳實踐

Python string 去掉標點符號 最佳實踐

Python 字串去掉標點符號最佳實踐

方法一:

str.isalnum
S.isalnum() -> bool
Return True if all characters in S are alphanumeric and there is at least one character in S, False otherwise.

>>> string = "Special $#! characters   spaces 888323"
>>> ''.join(e for e in string if e.isalnum())
'Specialcharactersspaces888323'

特點:

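  • 只保留字母和數字,殺傷力大,空格等空白字元也會一併被去掉;對 Python 2 的 byte str 來說中文也會被去掉,但對 Python 3 的 str(或 Python 2 的 unicode)而言,中文字元的 isalnum() 為 True,會被保留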

方法二:

string.punctuation

import re, string

s ="string. With. Punctuation?" # Sample string 

# Variant 1 (Python 2 only): two-argument str.translate(table, deletechars).
# string.maketrans("", "") builds an identity table, so the only effect of the
# call is deleting the characters listed in string.punctuation.
out = s.translate(string.maketrans("",""), string.punctuation)

# Variant 2 (Python 2 only): passing None as the table skips the mapping step
# and only performs the deletion.
out = s.translate(None, string.punctuation)

# Variant 3: keep only the characters that are not in the punctuation set.
exclude = set(string.punctuation)
out = ''.join(ch for ch in s if ch not in exclude)

# Variant 4: delete the punctuation characters one at a time.
out = s
for c in string.punctuation:
    out = out.replace(c, "")
# out == 'string With Punctuation'

# Variant 5: a single regex substitution. re.escape() backslash-escapes every
# character that could otherwise be read as a regex operator, so the
# punctuation can be placed inside a character class verbatim.
out = re.sub('[%s]' % re.escape(string.punctuation), '', s)

# Variant 6: string.punctuation only covers ASCII. A broader (but slower)
# approach classifies each character with the unicodedata module.
from unicodedata import category

s = u'String — with - «Punctuation »...'
out = re.sub('[%s]' % re.escape(string.punctuation), '', s)
print('Stripped ' + out)
# ASCII-only removal leaves the em dash and the guillemets in place:
# out == u'String \u2014 with  \xabPunctuation \xbb'
out = ''.join(ch for ch in s if category(ch)[0] != 'P')
print('Stripped ' + out)
# Every character whose Unicode category starts with 'P' is removed:
# out == u'String  with  Punctuation '

# For Python 3 str or Python 2 unicode values, translate() takes a single
# mapping: code points (integers) are looked up in it and anything mapped to
# None is removed. The two-argument translate() call does not work in
# Python 3, because translate no longer accepts the second argument.
# To remove the ASCII punctuation this way:
import string

remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
out = s.translate(remove_punct_map)

# Python 3 only (chr() has to accept every code point): build the deletion
# table from all code points whose Unicode category starts with 'P'.
import sys
import unicodedata

tbl = dict.fromkeys(
    i for i in range(sys.maxunicode + 1)
    if unicodedata.category(chr(i)).startswith('P')
)


def remove_punctuation(text):
    """Return *text* with every Unicode punctuation character removed."""
    return text.translate(tbl)

方法三:

re

import re
# Remove every character that is neither a word character nor whitespace.
# Note that \w also matches "_", so underscores survive this pattern.
s = "string. With. Punctuation?"
s = re.compile(r'[^\w\s]').sub('', s)

測試:

import re, string, timeit

# Sample input passed to every benchmark candidate.
s ="string. With. Punctuation"

# Fixtures are built once at module level; the timed functions below read
# them as globals.
exclude = set(string.punctuation)  # membership set read by test_set
table = string.maketrans("","")  # Python 2 only: identity table read by test_trans
regex = re.compile('[%s]' % re.escape(string.punctuation))  # precompiled character class read by test_re

def test_set(s):
    """Strip punctuation by filtering characters against the ``exclude`` set."""
    kept = (ch for ch in s if ch not in exclude)
    return ''.join(kept)

def test_re(s):
    """Strip punctuation with the precompiled module-level ``regex``."""
    stripped = regex.sub('', s)
    return stripped

def test_trans(s):
    """Strip punctuation via two-argument ``translate(table, deletechars)``.

    Reads the module-level ``table``. This call form is Python 2 only.
    """
    deletechars = string.punctuation
    return s.translate(table, deletechars)

def test_repl(s):
	for c in string.punctuation:
		s=s.replace(c,"")
	return s

# Time one million calls of each candidate. The setup string imports the
# sample input and the function under test from __main__, so this has to be
# run as a script. Label and result are combined with "%s" and passed to
# print() as a single argument, which prints the same text as the former
# Python 2 print statement while also parsing on Python 3.
print("sets : %s" % timeit.Timer('f(s)', 'from __main__ import s,test_set as f').timeit(1000000))
print("regex : %s" % timeit.Timer('f(s)', 'from __main__ import s,test_re as f').timeit(1000000))
print("translate : %s" % timeit.Timer('f(s)', 'from __main__ import s,test_trans as f').timeit(1000000))
print("replace : %s" % timeit.Timer('f(s)', 'from __main__ import s,test_repl as f').timeit(1000000))

輸出(Python 2,每個函式執行 1,000,000 次,單位為秒):
# sets : 19.8566138744
# regex : 6.86155414581
# translate : 2.12455511093
# replace : 28.4436721802