forked from Dao-zhi/GoogleScholarGUI
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xpath.py
112 lines (95 loc) · 3.78 KB
/
xpath.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from lxml import etree
text = '''
<div>
<ul>
<li class="item-0"><a href="https://ask.hellobi.com/link1.html">first item</a></li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-inactive"><a href="https://ask.hellobi.com/link3.html">third item</a></li>
<li class="item-1"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a>
<li class="li li-first"><a href="https://ask.hellobi.com/link.html">sixth item</a></li>
<li class="li li-first" name="item"><a href="https://ask.hellobi.com/link.html">seventh item</a></li>
</ul>
</div>
'''
html = etree.HTML(text)
result = etree.tostring(html)
print('-----解析节果-----')
# print(result.decode('utf-8')) # 输出解析后的节果
# 从文件中解析html
# html = etree.parse('./test.html', etree.HTMLParser())
# result = etree.tostring(html)
# print(result.decode('utf-8')) # 输出解析后的节果
# 所有节点
print('-----所有节点-----')
result = html.xpath('//*')
print(result)
# 所有li节点
print('-----所有li节点-----')
result = html.xpath('//li')
print(result)
print(result[0])
# 子节点
print('-----子节点-----')
result = html.xpath('//li/a') # li 节点的所有直接 a 子节点
print(result)
result = html.xpath('//ul//a') # ul 节点下的所有子孙 a 节点
print(result)
# 父节点
print('-----父节点-----')
result = html.xpath('//a[@href="https://ask.hellobi.com/link4.html"]/../@class')
print(result)
result = html.xpath('//a[@href="https://ask.hellobi.com/link4.html"]/parent::*/@class')
print(result)
# 属性匹配
print('-----属性匹配-----')
result = html.xpath('//li[@class="item-0"]')
print(result)
# 文本获取
print('-----文本获取-----')
result = html.xpath('//li[@class="item-0"]/text()')
print(result)
result = html.xpath('//li[@class="item-0"]/a/text()')
print(result)
result = html.xpath('//li[@class="item-0"]//text()')
print(result)
# 属性获取
print('-----属性获取-----')
result = html.xpath('//li/a/@href')
print(result)
# 属性多值匹配
print('-----属性多值匹配-----')
result = html.xpath('//li[@class="li"]/a/text()') # 无法匹配
print(result)
result = html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)
# 多属性匹配
print('-----多属性匹配-----')
result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()') # class=li并且name=item
print(result)
# 按序选择
print('-----按序选择-----')
result = html.xpath('//li[1]/a/text()') # 第一个
print(result)
result = html.xpath('//li[last()]/a/text()') # 最后一个
print(result)
result = html.xpath('//li[position()<3]/a/text()') # 前两个
print(result)
result = html.xpath('//li[last()-2]/a/text()') # 倒数第三个
print(result)
# 节点轴选择
print('-----节点轴选择-----')
result = html.xpath('//li[1]/ancestor::*') # 所有祖先节点
print(result)
result = html.xpath('//li[1]/ancestor::div') # 所有祖先节点中的div
print(result)
result = html.xpath('//li[1]/attribute::*') # 所有属性
print(result)
result = html.xpath('//li[1]/child::a[@href="https://ask.hellobi.com/link1.html"]') # 子节点中href为...的节点
print(result)
result = html.xpath('//li[1]/descendant::span') # 所有孙子节点中的span
print(result)
result = html.xpath('//li[1]/following::*[2]') # 当前节点之后的所有节点,这里我们虽然使用的是 * 匹配,但又加了索引选择,所以只获取了第二个后续节点。
print(result)
result = html.xpath('//li[1]/following-sibling::*') # 当前节点之后的所有同级节点,这里我们使用的是 * 匹配,所以获取了所有后续同级节点
print(result)