From 33f84ca24524c3c97fe3b694bf594ccf87152c54 Mon Sep 17 00:00:00 2001 From: chang-ning Date: Wed, 2 Mar 2016 08:19:30 +0800 Subject: [PATCH] Add some regular expression note about html tag --- docs/source/notes/python-rexp.rst | 61 +++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/docs/source/notes/python-rexp.rst b/docs/source/notes/python-rexp.rst index c0ad72b1..64241d23 100644 --- a/docs/source/notes/python-rexp.rst +++ b/docs/source/notes/python-rexp.rst @@ -2,6 +2,67 @@ Python Regular Expression cheatsheet ==================================== +Compare HTML tags +----------------- + ++------------+--------------+--------------+ +| tag type | format | example | ++============+==============+==============+ +| all tag | <[^>]+> | <br />, <a> | ++------------+--------------+--------------+ +| open tag | <[^/>][^>]*> | <a>, <table> | ++------------+--------------+--------------+ +| close tag | </[^>]+> | </p>
, </a> | ++------------+--------------+--------------+ +| self close | <[^/>]+/> | <br />
| ++------------+--------------+--------------+ + + +.. code-block:: python + + # open tag + >>> re.search('<[^/>][^>]*>', '<table>') != None + True + >>> re.search('<[^/>][^>]*>', '<a href="#label">') != None + True + >>> re.search('<[^/>][^>]*>', '<img src="/img">') != None + True + >>> re.search('<[^/>][^>]*>', '</table>
') != None + False + + # close tag + >>> re.search('</[^>]+>', '</table>') != None + True + + # self close + >>> re.search('<[^/>]+/>', '<br />
') != None + True + +``re.findall()`` match string +----------------------------- + +.. code-block:: python + + # split all string + >>> re.findall('[\w]+', source) + ['Hello', 'World', 'Ker', 'HAHA'] + + # parsing python.org website + >>> import urllib + >>> import re + >>> s = urllib.urlopen('https://www.python.org') + >>> html = s.read() + >>> s.close() + >>> print "open tags" + open tags + >>> re.findall('<[^/>][^>]*>', html)[0:2] + ['', '