世上多的是这号人:放下前头的好景致不看,干脆转过身来,一肚子气,总认为过去的都好,如此耽误时光,空耗了力气靠回忆过日子,苦瓜当饭,黄连煮汤,以为是天下第一味道。几十年过去了,年轻时代那些漂亮的女孩子都已成为漂亮的外婆和祖母。我希望大家都不要难过,人生就是按照诗的安排过下来的。我的经验是,碰到任何困难都要赶快往前走,不要欣赏让你摔倒的那个坑。
黄永玉,《别欣赏让你摔倒的那个坑》
Regex
import re
# Matches both 'Batman' and 'Batwoman': the group (wo) is optional.
bat = re.compile(r'Bat(wo)?man')
# Quantifiers that can follow a group:
# ()?       optional: zero or one occurrence
# ()*       zero or more occurrences
# ()+       one or more occurrences
# (){2,5}   between two and five occurrences (inclusive)
# (){2,}    at least two occurrences
# (\d){3,5} three to five digits (greedy: takes as many as it can, up to five)
rat = re.compile(r'Dinner\?\*\+')
# Backslash-escaped ?, * and + are literal characters, so this matches the text 'Dinner?*+'.
All matches
# Pattern for phone numbers written as ddd-ddd-dddd.
phone = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
text = '''Long texts with lots of numbers'''
phone.findall(text)
# findall() returns a list of every non-overlapping match; the pattern has no
# groups, so each item is the matched string. This placeholder text contains
# no phone number, so here the list is empty.
Multiple groups
# When the pattern contains groups, findall() returns one tuple per match with one item per group.
phone = re.compile(r'(\d\d\d)-(\d\d\d\-\d\d\d\d)') # two groups, separated by the first dash
# e.g. for text containing 508-555-5555 and 508-555-1234:
# [('508', '555-5555'), ('508', '555-1234')]
phone = re.compile(r'((\d\d\d)-(\d\d\d\-\d\d\d\d))') # the extra outer parentheses add a third group holding the whole number
# same text: [('508-555-5555', '508', '555-5555'), ('508-555-1234', '508', '555-1234')]
Character class
# \d Any numeric digit from 0 to 9
# \D Any character that is not a numeric digit from 0 to 9
# \w Any letter, numeric digit, or the underscore character.
# \W Any character that is not a letter, numeric digit, or the underscore character.
# \s Any space, tab, or newline character.
# \S Any character that is not a space, tab, or newline.
lyrics = '12 drummers drumming, 11 pipers piping, 10 lords a leaping, 9 ladies dancing.'
# One or more digits, a single whitespace character, then one or more word characters.
xmas = re.compile(r'\d+\s\w+')
xmas.findall(lyrics)
# returns ['12 drummers', '11 pipers', '10 lords', '9 ladies']
# A character class [...] matches any single character listed inside the brackets.
vowel = re.compile(r'[aeiouAEIOU]')
# Each match is a single character.
vowel.findall('robocop eats baby food.')
# returns ['o','o','o','e','a','a','o','o']
# {2} requires two consecutive characters from the class.
doubleVowel = re.compile(r'[aeiouAEIOU]{2}')
doubleVowel.findall('Robocop eats baby food')
# returns ['ea','oo']
# A caret right after the opening bracket negates the class: match any single character NOT listed.
consonantsRegex = re.compile(r'[^aeiouAEIOU]')
consonantsRegex.findall('robocop eats baby food.')
# returns ['r','b','c','p',' ','t','s',' ','b','b','y',' ','f','d','.']
# Spaces and punctuation are returned too, because they are not in the vowel list.
Dot-Star and the Caret Characters
beginsWithHelloRegex = re.compile(r'^Hello') # ^ anchors the match at the start of the string
endsWithWorldRegex = re.compile(r'world!$') # $ anchors the match at the end of the string
allDigitsRegex = re.compile(r'^\d+$') # both anchors: the whole string must consist of digits
# . matches any single character except a newline.
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')
# returns ['cat', 'hat', 'sat', 'lat', 'mat'] -- 'flat' yields 'lat' because . is exactly one character
atRegex = re.compile(r'.{1,2}at') # one or two characters (of any kind, not only letters) before 'at'
# Capture the first and last name. Each (.*) grabs any run of characters
# except newlines; the literal ' Last Name: ' in between forces the first
# group to stop at the end of the first name.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
nameRegex.findall('First Name: AI Last Name: Sweigart') # [('AI', 'Sweigart')]
nongreedy = re.compile(r'<(.*?)>') # lazy .*? captures as little as possible: the text up to the first '>'
greedy = re.compile(r'.*',re.DOTALL) # by default . does not match a newline; re.DOTALL lets .* run across lines
# Passing re.I (re.IGNORECASE) as the second argument makes matching case-insensitive.
Sub() and Verbose method
# sub() returns a copy of the string with every match replaced by the first argument.
names = re.compile(r'Agent \w+')
names.sub("Shabi","Agent give Alice a box.") # returns 'Shabi Alice a box.' ('Agent give' is the only match)
# re.VERBOSE lets a long pattern be spread over several lines: whitespace in
# the pattern is ignored and '#' starts a comment inside it.
# NOTE(review): as written this compiles to (\d\d\d-)|-\d\d\d-\d\d\d\d, i.e. it
# matches either 'ddd-' or '-ddd-dddd' rather than a full phone number, and the
# in-pattern comments are shifted against the lines they describe. The '|'
# after the first group looks unintended -- confirm the intended pattern.
re.compile(r'''
(\d\d\d-)| #area code
-
\d\d\d # first dash
-
\d\d\d\d #last 4 digits''',re.VERBOSE)
# Combine several flags with the bitwise OR operator, e.g.:
# re.compile(pattern, re.VERBOSE | re.DOTALL | re.I)
Copy from a pdf
# 从 PDF 批量复制邮件和电话号码
import re,pyperclip
# Phone numbers
# Matches e.g. '415-555-1234', '(415) 555-1234' and '555-1234', each with an
# optional extension such as ' ext. 99' or 'x123'.
# The area code and the separator after it are optional as a unit (wrapped in
# a non-capturing group), so a bare 7-digit number matches and a match never
# starts with a stray space or dash. The capture groups are unchanged in number
# and order; item 0 of every findall() tuple is the whole phone number.
phone = re.compile(r'''
(
(?:((\d{3})|(\(\d{3}\)))     # area code, with or without parentheses
(\s|-))?                     # separator after the area code
\d{3}                        # first 3 digits
-                            # separator
\d{4}                        # last 4 digits
(\s*((ext(\.)?\s)|x)         # extension word-part (optional), may follow whitespace
(\d{2,5}))?                  # extension number-part
)
''',re.VERBOSE)
# Email
# The pattern deliberately has no capture groups, so findall() returns each
# address as a plain string. Hyphens are allowed in both parts, and the domain
# must end in a dot plus an alphabetic top-level domain, which keeps trailing
# sentence punctuation ('... bob@example.com.') out of the match.
email = re.compile(r'''
[a-zA-Z0-9._%+-]+   # name part
@                   # @ symbol
[a-zA-Z0-9.-]+      # domain name part
\.[a-zA-Z]{2,}      # dot and top-level domain
''',re.VERBOSE)
# Read the text to search from the clipboard.
text = pyperclip.paste()
# Collect every phone number and e-mail address found in that text.
extractedPhone = phone.findall(text)
extractedEmail = email.findall(text)
# The phone pattern has capture groups, so findall() yields tuples;
# item 0 of each tuple is the whole phone number.
allPhoneNumbers = [phoneNumber[0] for phoneNumber in extractedPhone]
# Join one combined list so that an empty phone or e-mail list does not leave
# a stray blank line at the start or end of the output.
results = '\n'.join(allPhoneNumbers + extractedEmail)
# Put the result back on the clipboard, one entry per line.
pyperclip.copy(results)
Helper