世上多的是这号人:放下前头的好景致不看,干脆转过身来,一肚子气,总认为过去的都好,如此耽误时光,空耗了力气靠回忆过日子,苦瓜当饭,黄连煮汤,以为是天下第一味道。几十年过去了,年轻时代那些漂亮的女孩子都已成为漂亮的外婆和祖母。我希望大家都不要难过,人生就是按照诗的安排过下来的。我的经验是,碰到任何困难都要赶快往前走,不要欣赏让你摔倒的那个坑。
黄永玉,《别欣赏让你摔倒的那个坑》

Regex

import re
bat = re.compile(r'Bat(wo)?man')
# ()?  optional group: matches zero or one time

# ()*  matches zero or more times

# ()+  matches one or more times

# (){2,5}  matches between two and five times (inclusive)

# (){2,}  matches at least two times

# (\d){3,5}  matches three to five digits

rat = re.compile(r'Dinner\?\*\+')
# Backslash-escaped ?, * and + are literal characters: this matches the exact text "Dinner?*+"

All matches

phone = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
text = '''Long texts with lots of numbers'''
phone.findall(text)
# findall() returns a list of all non-overlapping matches (plain strings here, because the pattern has no groups)

Multiple groups

phone = re.compile(r'(\d\d\d)-(\d\d\d\-\d\d\d\d)') # two capture groups, separated by the first dash

# With groups, findall() returns a list of tuples (one item per group), e.g.
# [('508', '555-5555'), ('508', '555-1234')]

phone = re.compile(r'((\d\d\d)-(\d\d\d\-\d\d\d\d))') # the extra outer parentheses add a third group holding the whole number

# e.g. [('508-555-5555', '508', '555-5555'), ('508-555-1234', '508', '555-1234')]

Character class

# \d Any numeric digit from 0 to 9

# \D Any character that is not a numeric digit from 0 to 9

# \w Any letter, numeric digit, or the underscore character. 

# \W Any character that is not a letter, numeric digit, or the underscore character.

# \s Any space, tab, or newline character.

# \S Any character that is not a space, tab, or newline.


lyrics = '12 drummers drumming, 11 pipers piping, 10 lords a leaping, 9 ladies dancing.'
xmas = re.compile(r'\d+\s\w+')
xmas.findall(lyrics)
# returns ['12 drummers', '11 pipers', '10 lords', '9 ladies']


vowel = re.compile(r'[aeiouAEIOU]')
# A character class matches exactly one character from the set per match.

vowel.findall('robocop eats baby food.')
# returns ['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o']

doubleVowel = re.compile(r'[aeiouAEIOU]{2}')
doubleVowel.findall('Robocop eats baby food')
# returns ['ea', 'oo']


consonantsRegex = re.compile(r'[^aeiouAEIOU]')
consonantsRegex.findall('robocop eats baby food.')
# returns ['r', 'b', 'c', 'p', ' ', 't', 's', ' ', 'b', 'b', 'y', ' ', 'f', 'd', '.']

# A ^ placed right after the opening [ negates the class: it matches any character NOT listed in the brackets.

Dot-Star and the Caret Characters

beginsWithHelloRegex = re.compile(r'^Hello') # ^ anchors the match at the start of the string


endsWithWorldRegex = re.compile(r'world!$') # $ anchors the match at the end of the string (or just before a trailing newline)


allDigitsRegex = re.compile(r'^\d+$') # ^ and $ together: the whole string must consist of digits only


atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.') # returns ['cat', 'hat', 'sat', 'lat', 'mat']
atRegex = re.compile(r'.{1,2}at') # one or two characters of any kind (except newline) before "at"


# Each (.*) group captures any run of characters (except newline), so the two
# groups pick up the first and the last name respectively.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
nameRegex.findall('First Name: AI Last Name: Sweigart') # returns [('AI', 'Sweigart')]


nongreedy = re.compile(r'<(.*?)>') # non-greedy: captures as little as possible between < and >

greedy = re.compile(r'.*',re.DOTALL) # . normally does not match a newline; the re.DOTALL flag makes it match newlines too


# Passing re.I as the second argument to re.compile() makes matching case-insensitive.

Sub() and Verbose method

# sub() replaces every match of the pattern with the replacement text.

names = re.compile(r'Agent \w+')
# The sample sentence needs two "Agent <name>" occurrences to show that all
# matches are replaced, not just the first one.
names.sub("Shabi","Agent Bob give Agent Alice a box.") # returns 'Shabi give Shabi a box.'

# re.VERBOSE lets a long pattern span several lines with inline comments:
# whitespace and #-comments inside the pattern are ignored by the regex engine.

verbosePhone = re.compile(r'''
(\d\d\d)  # area code
-         # first dash
\d\d\d    # first 3 digits
-         # second dash
\d\d\d\d  # last 4 digits''',re.VERBOSE)

# Use multiple options: combine flags with the bitwise OR operator |

# re.compile(pattern, re.VERBOSE | re.DOTALL | re.I)

Copy from a pdf

# 从 PDF 批量复制邮件和电话号码

import re,pyperclip

# Phone numbers

# Matches US-style phone numbers such as 415-555-0000, (415) 555-0000,
# 555-0000, 555-0000 ext 12345, 555-0000 ext. 12345 and 555-0000x12345.
# Group 1 is always the complete number, so findall() yields tuples whose
# first item is the full match. The number and order of the capture groups
# is the same as before; only a non-capturing wrapper was added.
phone = re.compile(r'''
(?<!\d)                        # never start in the middle of a longer digit run
(
(?:((\d\d\d)|(\(\d\d\d\)))     # area code, plain or in parentheses
(\s|-))?                       # first separator; optional together with the area code
\d\d\d                         # first 3 digits
-                              # separator
\d\d\d\d                       # last 4 digits
(\s*((ext(\.)?\s)|x)           # extension word-part (optional), may follow whitespace
(\d{2,5}))?                    # extension number-part
)
''',re.VERBOSE)

# Email

# Matches e-mail addresses such as first.last+tag@mail.example-site.org.
# The pattern has no capture groups, so findall() returns plain strings.
# The domain must contain at least one dot, and a sentence-ending "." after
# the address is not included in the match.
email = re.compile(r'''
[a-zA-Z0-9_.+%-]+       # name part
@                       # @ symbol
[a-zA-Z0-9-]+           # first domain label (hyphens allowed)
(?:\.[a-zA-Z0-9-]+)+    # one or more further dot-separated labels
''',re.VERBOSE)

# Get the text off the clipboard

text = pyperclip.paste()

# Extract every phone number and e-mail address from the clipboard text.
extractedPhone = phone.findall(text)
extractedEmail = email.findall(text)

# The phone pattern contains groups, so findall() yields one tuple per match;
# item 0 of each tuple is the complete phone number.
allPhoneNumbers = [phoneNumber[0] for phoneNumber in extractedPhone]

# Join both lists in one pass so that an empty phone or e-mail list does not
# leave a stray blank line at the start or end of the copied text.
results = '\n'.join(allPhoneNumbers + extractedEmail)
pyperclip.copy(results)

Helper