-
-
Notifications
You must be signed in to change notification settings - Fork 3.3k
Expand file tree
/
Copy pathtranslator.py
More file actions
134 lines (105 loc) · 5.23 KB
/
translator.py
File metadata and controls
134 lines (105 loc) · 5.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
Most of this file is an adapted version of the parsel library's translator, with some modifications made for one important reason:
to add the pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so that we match the Parsel/Scrapy selectors format, which will be important in future releases. But most importantly,
it means you don't have to learn a new selectors/API method, like what bs4 did with soupsieve :)
If you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
"""
from functools import lru_cache
from cssselect import HTMLTranslator as OriginalHTMLTranslator
from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
from scrapling.core._types import Any, Protocol, Self
class XPathExpr(OriginalXPathExpr):
    """XPath expression that can also select text nodes or an attribute value.

    On top of the base cssselect expression, two extra fields are honoured
    when the expression is rendered with ``str()``:

    * ``textnode``: when true, the rendered path selects ``text()`` nodes.
    * ``attribute``: when not ``None``, ``/@<attribute>`` is appended to the
      rendered path.
    """

    # True when the rendered path should end in a ``text()`` step.
    textnode: bool = False
    # Attribute name appended as ``/@name`` when rendering; ``None`` disables it.
    attribute: str | None = None

    @classmethod
    def from_xpath(
        cls,
        xpath: OriginalXPathExpr,
        textnode: bool = False,
        attribute: str | None = None,
    ) -> Self:
        """Build an instance from an existing expression.

        The ``path``, ``element`` and ``condition`` of ``xpath`` are copied
        into the new object, then ``textnode`` and ``attribute`` are set from
        the given arguments.
        """
        x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
        x.textnode = textnode
        x.attribute = attribute
        return x

    def __str__(self) -> str:
        """Render the expression, applying the text-node / attribute suffix."""
        path = super().__str__()

        if self.textnode:
            if path == "*":  # pragma: no cover
                # A bare wildcard is replaced entirely by a text() step.
                path = "text()"
            elif path.endswith("::*/*"):  # pragma: no cover
                # Drop the trailing "*/*" so that text() directly follows the axis.
                path = path[:-3] + "text()"
            else:
                path += "/text()"

        if self.attribute is not None:
            if path.endswith("::*/*"):  # pragma: no cover
                # Drop the trailing "/*" before appending the attribute step.
                path = path[:-2]
            path += f"/@{self.attribute}"

        return path

    def join(
        self: Self,
        combiner: str,
        other: OriginalXPathExpr,
        *args: Any,
        **kwargs: Any,
    ) -> Self:
        """Join ``other`` onto this expression and adopt its pseudo-element state.

        After delegating to the parent ``join``, ``textnode`` and ``attribute``
        are overwritten with the values carried by ``other``.

        Raises:
            ValueError: if ``other`` is not an instance of this ``XPathExpr``
                class (or a subclass of it).
        """
        if not isinstance(other, XPathExpr):
            raise ValueError(  # pragma: no cover
                f"Expressions of type {__name__}.XPathExpr can only join expressions"
                f" of the same type (or its descendants), got {type(other)}"
            )
        super().join(combiner, other, *args, **kwargs)
        self.textnode = other.textnode
        self.attribute = other.attribute
        return self
# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
class TranslatorProtocol(Protocol):
    """Structural type describing the translator methods the mixin relies on."""

    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore  # pragma: no cover
        """Translate an element selector into an XPath expression."""
        ...

    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pyright: ignore  # pragma: no cover
        """Translate a CSS query string into an XPath string."""
        ...
class TranslatorMixin:
    """This mixin adds support to CSS pseudo elements via dynamic dispatch.

    Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
    """

    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
        """Translate ``selector`` with the parent class and wrap the result in ``XPathExpr``."""
        # https://github.com/python/mypy/issues/14757
        return XPathExpr.from_xpath(super().xpath_element(selector))  # type: ignore[safe-super]

    def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr:
        """
        Look up the handler method for ``pseudo_element`` by name and apply it to ``xpath``.

        Functional pseudo-elements are routed to ``xpath_<name>_functional_pseudo_element``
        and simple ones to ``xpath_<name>_simple_pseudo_element`` (dashes in the name
        become underscores). ``ExpressionError`` is raised when no such handler exists.
        """
        if isinstance(pseudo_element, FunctionalPseudoElement):
            handler = getattr(
                self,
                f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element",
                None,
            )
            if not handler:  # pragma: no cover
                raise ExpressionError(f"The functional pseudo-element ::{pseudo_element.name}() is unknown")
            # Functional handlers also receive the pseudo-element itself (for its arguments).
            return handler(xpath, pseudo_element)

        handler = getattr(
            self,
            f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element",
            None,
        )
        if not handler:  # pragma: no cover
            raise ExpressionError(f"The pseudo-element ::{pseudo_element} is unknown")
        return handler(xpath)

    @staticmethod
    def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr:
        """Handle ``::attr(NAME)``: return an expression selecting that attribute's value."""
        arg_types = function.argument_types()
        # Exactly one argument is accepted, and it must be a string or an identifier.
        if arg_types != ["STRING"] and arg_types != ["IDENT"]:  # pragma: no cover
            raise ExpressionError(f"Expected a single string or ident for ::attr(), got {function.arguments!r}")
        attr_name = function.arguments[0].value
        return XPathExpr.from_xpath(xpath, attribute=attr_name)

    @staticmethod
    def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr:
        """Handle ``::text``: return an expression selecting text nodes."""
        return XPathExpr.from_xpath(xpath, textnode=True)
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
    """cssselect HTML translator combined with the pseudo-element mixin."""

    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        """Translate ``css`` to XPath by delegating to the parent implementation."""
        xpath_query = super().css_to_xpath(css, prefix)
        return xpath_query
# Module-level translator instance; the ``css_to_xpath`` function in this module calls it.
translator = HTMLTranslator()
# Using a function instead of the translator directly to avoid Pyright override error
@lru_cache(maxsize=256)
def css_to_xpath(query: str) -> str:
    """Translate a CSS query into its XPath equivalent.

    Results are memoized (up to 256 distinct queries) via ``lru_cache``.
    """
    xpath_query = translator.css_to_xpath(query)
    return xpath_query