lightjake
diff --git a/‎.idea/workspace.xml‎
Lines changed: 28 additions & 45 deletions b/‎.idea/workspace.xml‎
Lines changed: 28 additions & 45 deletions
diff --git a/‎regularExpression/.idea/misc.xml‎
Lines changed: 14 additions & 0 deletions b/‎regularExpression/.idea/misc.xml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎regularExpression/.idea/modules.xml‎
Lines changed: 8 additions & 0 deletions b/‎regularExpression/.idea/modules.xml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎regularExpression/.idea/regularExpression.iml‎
Lines changed: 11 additions & 0 deletions b/‎regularExpression/.idea/regularExpression.iml‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎regularExpression/.idea/workspace.xml‎
Lines changed: 466 additions & 0 deletions b/‎regularExpression/.idea/workspace.xml‎
Lines changed: 466 additions & 0 deletions
diff --git a/‎regularExpression/README.md‎
Lines changed: 70 additions & 0 deletions b/‎regularExpression/README.md‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎regularExpression/reTest.py‎
Lines changed: 41 additions & 0 deletions b/‎regularExpression/reTest.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎regularExpression/re模块其他方法.py‎
Lines changed: 39 additions & 0 deletions b/‎regularExpression/re模块其他方法.py‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎regularExpression/re练习.py‎
Lines changed: 41 additions & 0 deletions b/‎regularExpression/re练习.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎regularExpression/正则表达式语法.py‎
Lines changed: 87 additions & 0 deletions b/‎regularExpression/正则表达式语法.py‎
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,70 @@
+# 正则表达式
+1. 使用单个字符串来描述匹配一系列符合某个句法规则的字符串。
+2. 是对字符串操作的一种逻辑公式
+3. 应用场景：处理文本和数据
+4. 正则表达式过程：依次拿出表达式和文本中的字符比较，如果每一个字符都能匹配， 则匹配成功；否则匹配失败。
+
+## Python下使用正则表达式
+1. `import re`：正则表达式模块：Python通过re模块提供对正则表达式的支持。使用re的一般步骤是先将正则表达式的字符串形式编译为Pattern实例，然后使用Pattern实例处理文本并获得匹配结果（一个Match实例），最后使用Match实例获得信息，进行其他的操作。字符串前缀r表示原始字符串（raw string），不使用原始字符串时需要注意反斜杠的转义。
+
+使用正则表达式匹配字符串开头是否为指定的字符或字符串：
+
+
+```
+import re
+str1 = 'test python'
+# 将正则表达式编译成pattern对象
+# 使用r'test', 前缀r表示这是一个原始字符串(raw string)
+pa = re.compile(r'test')    # pa已经成为一个pattern实例
+print(type(pa))
+
+ma = pa.match(str1)         # 若匹配成功, ma成为一个match对象
+
+print(ma)
+print(ma.group())           # group()返回一个str或者tuple
+print(ma.span())            # 返回字符串的索引
+print(ma.re)                # pattern的实例
+```
+
+```
+# 返回结果
+<class '_sre.SRE_Pattern'>
+<_sre.SRE_Match object; span=(0, 4), match='test'>
+test
+(0, 4)
+re.compile('test')
+```
+
+## 正则表达式语法
+
+
+|字符|匹配|
+|------|------|
+|.|匹配任意字符(除了\n)|
+|[...]|匹配字符集|
+|\d / \D|匹配数字/非数字|
+|\s / \S|匹配空白/非空白字符|
+|\w / \W|匹配单词字符[a-zA-Z0-9_]/非单词字符|
+|\*|匹配前一个字符0次或者无限次|
+|+|匹配前一个字符1次或者无限次|
+|?|匹配前一个字符0次或1次|
+|{m}/{m,n}|匹配前一个字符m次 / m到n次|
+|{m,}|匹配前一个字符m次或更多次|
+|\*? / +? / ??|匹配模式变为非贪婪(尽可能少匹配字符)|
+|^|匹配字符串开头|
+|$|匹配字符串结尾|
+|\A / \Z|指定的字符串匹配必须出现在开头/结尾|
+|\||匹配左右任意一个表达式|
+|(ab)|括号中表达式作为一个分组|
+|\\\<number>|引用编号为number的分组匹配到的字符串|
+|(?P\<name>)|分组起一个别名|
+|(?P=name)|引用别名为name的分组匹配字符串|
+
+## Python正则表达式--re模块其他方法
+1. search(pattern, string, flags=0)在一个字符串中查找匹配
+2. findall(pattern, string, flags=0)找到匹配，返回所有匹配部分的列表
+3. sub(pattern, repl, string, count=0, flags=0)将字符串中匹配正则表达式的部分替换为其它值
+4. split(pattern, string, maxsplit=0, flags=0)根据匹配分割字符串， 返回分割字符串组成的列表
+
+## re练习
+利用正则表达式尝试抓取了20160711[腾讯网](http://www.qq.com/)上的jpg和png图像。
@@ -0,0 +1,41 @@
+str1 = 'test python'
+# Lookups with plain str methods, without any regular expression.
+print(str1.find('1'))
+print(str1.find('test'))
+print(str1.startswith('test'))
+
+# The same kind of lookup using a regular expression.
+import re
+# Compile the regular expression into a pattern object.
+# r'test' is a raw string literal: backslashes in it are not treated as escapes.
+pa = re.compile(r'test')    # pa is now a compiled pattern instance
+print(type(pa))
+
+ma = pa.match(str1)         # on success ma is a match object (None otherwise)
+
+print(ma)
+print(ma.group())           # group() returns a str (or a tuple when several groups are requested)
+print(ma.span())            # (start, end) indices of the match within the string
+print(ma.re)                # the pattern object that produced this match
+
+# Another example: the pattern is found at the start of the string.
+pa2 = re.compile(r'_')
+ma2 = pa2.match('_value')
+
+print(ma2.group())
+
+# Example 3: match() only tries at the beginning of the string, so ma3 is None here.
+pa3 = re.compile(r'_')
+ma3 = pa3.match('value_')
+# print(ma3.group())          # would fail: 'NoneType' object has no attribute 'group'
+
+# Case-insensitive matching.
+pa = re.compile(r'test', re.I)      # re.I (IGNORECASE) ignores letter case
+print(pa)
+
+ma = pa.match('Test python')
+print(ma.group())
+
+# The same case-insensitive match through the module-level re.match(), without compiling first.
+ma = re.match(r'test', 'Test Python', re.I)
+print(ma.group())
@@ -0,0 +1,39 @@
+import re
+
+str1 = 'study time = 1000'
+
+print(str1.find('1000'))
+
+# str.find only works while the number is exactly 1000. re.search finds the
+# number whatever its value: \d+ matches one or more consecutive digits.
+# search returns only the first match.
+info = re.search(r'\d+', str1)
+print(info.group())
+
+str1 = 'study time = 10000'
+info = re.search(r'\d+', str1)
+print(info.group())
+
+# findall returns every match as a list of strings.
+str2 = 'python code num = 100, swift code num = 50, c++ code num = 10'
+info = re.findall(r'\d+', str2)
+print(info)
+print(sum([int(x) for x in info]))
+
+# sub replaces every part of the string that matches the pattern.
+str3 = 'python num = 1000'
+info = re.sub(r'\d+', '1001', str3)         # replace the number in str3 with 1001
+print(info)
+
+# Replacement callback for re.sub: returns the matched number plus one, as a string.
+def add(match):
+    # match.group() is the text matched by the whole pattern
+    val = match.group()
+    num = int(val) + 1
+    # re.sub requires the callback to return a string
+    return str(num)
+
+info = re.sub(r'\d+', add, str3)    # add is called once per match; its return value is the replacement
+print(info)
+
+# split: cut the string at every '=', space or ','.
+# The ', ' before 'C#' is two adjacent separators, so the result contains an empty string.
+str4 = 'class=C C++ Java Python, C#'
+info = re.split(r'=| |,', str4)
+print(info)
@@ -0,0 +1,41 @@
+'''
+抓取腾讯主页中的图片到本地
+1. 抓取网页
+2. 抓取图片地址
+3. 抓取图片内容并保存到本地
+'''
+
+
+import requests
+def getdata():
+    url = 'http://www.qq.com/'
+    buf = requests.get(url)
+    return buf
+
+buf = getdata()
+
+import re
+from numpy import *
+
+# 抓取png和jpg图像
+listurl1 = re.findall(r'src="http://.*\.png', buf.text)
+listurl2 = re.findall(r'src="http://.*\.jpg', buf.text)
+listURL = []
+
+# 去除url上不必要的前缀
+for url in listurl1:
+     listURL.append(re.findall(r'http:.*\.png', url))
+for url in listurl2:
+    listURL.append(re.findall(r'http:.*\.jpg', url))
+
+# 将网上图片写入本地
+i = 0
+for url in listURL:
+    f = open(str(i)+'.png', 'wb')
+    buf = requests.get(url[0], stream=True)
+    for chunk in buf.iter_content(chunk_size=1024):
+        if chunk:
+            f.write(chunk)
+            f.flush()
+    f.close()
+    i += 1
@@ -0,0 +1,87 @@
+import re
+
+# A literal character matches itself.
+ma = re.match(r'a', 'a')
+print(ma.group())
+
+# '.' matches any single character (except a newline).
+ma = re.match(r'.', 'b')
+print(ma.group())
+
+ma = re.match(r'{.}', '{a}')    # any one character between literal braces
+print(ma.group())
+
+ma = re.match(r'{..}', '{ab}')  # any two characters between literal braces
+print(ma.group())
+
+# [...] matches one character out of a set.
+ma = re.match(r'{[abc]}', '{a}') # a, b or c between the braces
+print(ma.group())
+
+ma = re.match(r'{[a-z]}', '{d}') # any one character in the range a-z
+print(ma.group())
+
+ma = re.match(r'{[a-zA-Z]}', '{Z}')
+print(ma.group())
+
+ma = re.match(r'{[\w]}', '{0}') # \w matches a word character: a-z, A-Z, 0-9 and also '_'
+print(ma.group())
+
+# To match literal square brackets they have to be escaped.
+ma = re.match(r'\[[\w]\]', '[a]')   # '\[' and '\]' are the escaped brackets
+print(ma.group())
+
+# One uppercase letter followed by exactly one lowercase letter.
+ma = re.match(r'[A-Z][a-z]', 'Aa')
+print(ma.group())
+
+ma = re.match(r'[A-Z][a-z]*', 'A') # one uppercase letter followed by zero or more lowercase letters
+print(ma.group())
+ma = re.match(r'[A-Z][a-z]*', 'Aa')
+print(ma.group())
+ma = re.match(r'[A-Z][a-z]*', 'Aafdsfdsb')
+print(ma.group())
+ma = re.match(r'[A-Z][a-z]*', 'Aafdsfdsbadfas154154')       # only the letters before the digits are matched
+print(ma.group())
+
+# One or more characters from '_', a-z, A-Z, followed by any number of word
+# characters. Can be used as a rough check for a valid variable name.
+# Note that '+' means the preceding item occurs one or more times.
+ma = re.match(r'[_a-zA-Z]+[_\w]*', '__init')
+print(ma.group())
+
+# Intended to match the numbers 0-99: an optional leading digit 1-9, then one
+# digit 0-9. '?' means the preceding item occurs zero times or once.
+ma = re.match(r'[1-9]?[0-9]', '99')
+print(ma.group())
+
+ma = re.match(r'[1-9]?[0-9]', '09')                 # prints '0', not '09': the leading digit may not be 0
+print(ma.group())
+
+ma = re.match(r'[a-zA-Z0-9]{6}', 'bac123')          # exactly six characters from a-zA-Z0-9
+print(ma.group())
+# ma = re.match(r'[a-zA-Z0-9]{6}', 'bac12')       # returns None because 'bac12' has fewer than 6 characters
+# print(ma.group())
+ma = re.match(r'[a-zA-Z0-9]{6}', 'bac1234')         # prints 'bac123': only the first six characters
+print(ma.group())
+
+# A 163 mail address whose local part has exactly 6 characters.
+# NOTE: the '.' in '@163.com' is not escaped, so it matches any character there.
+ma = re.match(r'[a-zA-Z0-9]{6}@163.com', 'abc123@163.com')
+print(ma.group())
+# A 163 mail address whose local part has 6 to 10 characters.
+ma = re.match(r'[a-zA-Z0-9]{6,10}@163.com', 'abc123gd@163.com')
+print(ma.group())
+
+ma = re.match(r'[0-9][a-z]*?', '1bc')               # prints '1': '*?' is non-greedy and matches as little as possible
+print(ma.group())
+
+ma = re.match(r'[0-9][a-z]+?', '1bc')               # prints '1b': '+?' still needs at least one letter
+print(ma.group())
+
+# Anchoring a match to the start or the end of the string.
+# Without an end anchor this prints 'abc123gd@163.com' although the input has
+# a trailing 'abc', which is not the address we wanted to accept.
+ma = re.match(r'[\w]{4,10}@163.com', 'abc123gd@163.comabc')
+print(ma.group())
+# ma = re.match(r'[\w]{4,10}@163.com$', 'abc123gd@163.comabc')    # with '$' the string must end right after '@163.com', so this returns None
+# print(ma.group())
+
+# '^' anchors the pattern at the start of the string (re.match already starts there).
+ma = re.match(r'^[\w]{4,10}@163.com$', 'abc123gd@163.com')
+print(ma.group())
+
+
+# '\A' anchors at the very start of the string, so the literal 'abc' must come first.
+ma = re.match(r'\Aabc[\w]{4,10}@163.com$', 'abc123gd@163.com')
+print(ma.group())