Skip to content

Commit b617978

Browse files
committed
Implemented bulk replacing in files.
1 parent b25084a commit b617978

9 files changed

Lines changed: 333 additions & 5 deletions

File tree

https_everywhere/__main__.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
import asyncio
2+
import sys
3+
from concurrent.futures import ThreadPoolExecutor
4+
from functools import partial
5+
from os import cpu_count
6+
from pathlib import Path
7+
8+
from binaryornot.check import is_binary
9+
from plumbum import cli
10+
11+
from .core import CombinedReplacerFactory
12+
from .core.InBufferReplacer import InBufferReplacer
13+
from .core.InFileReplacer import InFileReplacer
14+
from .replacers.HEReplacer import HEReplacer
15+
from .replacers.HSTSPreloadReplacer import HSTSPreloadReplacer
16+
17+
18+
class OurInBufferReplacer(InBufferReplacer):
    """In-buffer replacer wired to the HSTS-preload and HTTPS Everywhere replacers.

    ``FACS`` maps each constructor keyword onto the replacer class that is
    instantiated with that keyword's value.
    """

    __slots__ = ()

    # keyword name -> replacer class built from that keyword's value
    FACS = CombinedReplacerFactory(
        {"preloads": HSTSPreloadReplacer, "heRulesets": HEReplacer}
    )

    def __init__(self, preloads=None, heRulesets=None):
        """Forward both data sets (``None`` is passed through as-is) to the base class."""
        InBufferReplacer.__init__(self, preloads=preloads, heRulesets=heRulesets)
29+
30+
31+
class OurInFileReplacer(InFileReplacer):
    """File-level replacer that drives an ``OurInBufferReplacer``."""

    def __init__(self, preloads=None, heRulesets=None):
        """Build the buffer replacer from the given data sets and hand it to the base class."""
        bufferReplacer = OurInBufferReplacer(preloads=preloads, heRulesets=heRulesets)
        InFileReplacer.__init__(self, bufferReplacer)
34+
35+
36+
# Root command-line application. It has no behavior of its own here;
# subcommands register themselves on it with ``@CLI.subcommand(...)``.
class CLI(cli.Application):
    """HTTPSEverywhere-like URI rewriter"""
38+
39+
40+
@CLI.subcommand("bulk")
class FileRewriteCLI(cli.Application):
    """Rewrites URIs in files. Use - to consume list of files from stdin. Avoid `find` for producing that list: it is hard to configure it to skip .git dirs."""

    __slots__ = ("_repl",)

    noSkipBinary = cli.Flag(
        ["--no-skip-binary", "-n"],
        help="Don't skip binary files. Allows usage without `binaryornot`",
        default=False,
    )
    noSkipDot = cli.Flag(
        ["--no-skip-dotfiles", "-d"],
        help="Don't skip files and dirs which name stem begins from dot.",
        default=False,
    )

    @property
    def repl(self):
        # Lazily build the file replacer on first use and report how much
        # data each child replacer holds.
        if self._repl is None:
            self._repl = OurInFileReplacer()
            children = self._repl.inBufferReplacer.singleURIReplacer.children
            print(len(children[0].preloads), "HSTS preloads")
            print(len(children[1].rulesets), "HE rules")
        return self._repl

    def processEachFileName(self, l):
        # ``l`` is one raw line from stdin (bytes from the stream reader).
        # Blank lines, including the empty read at EOF, are ignored instead
        # of being turned into a path.
        l = l.strip()
        if not l:
            return None
        if isinstance(l, bytes):
            l = l.decode("utf-8")
        return self.processEachFilePath(Path(l).resolve().absolute())

    def processEachFilePath(self, p):
        # Rewrite one file, honouring the dotfile and binary skip switches.
        # Every component of the path is checked, so a file below any
        # dot-directory is skipped as well.
        if not self.noSkipDot:
            for pa in p.parts:
                if pa.startswith("."):
                    print("Skipping ", p, ": dotfile")
                    return

        if not p.is_dir():
            if self.noSkipBinary or not is_binary(p):
                self.repl(p)
            else:
                print("Skipping ", p, ": binary")

    async def asyncMainPathsFromStdIn(self):
        # Read file names from stdin, one per line, and process each of them.
        # Native coroutine: ``asyncio.coroutine`` no longer exists in
        # Python 3.11+.
        asyncStdin = asyncio.StreamReader(loop=self.loop)
        await self.loop.connect_read_pipe(
            lambda: asyncio.StreamReaderProtocol(asyncStdin, loop=self.loop), sys.stdin
        )
        with ThreadPoolExecutor(max_workers=cpu_count()) as pool:
            while not asyncStdin.at_eof():
                l = await asyncStdin.readline()
                # Each file is awaited before the next line is read, so files
                # are processed one at a time.
                await self.loop.run_in_executor(pool, partial(self.processEachFileName, l))

    async def asyncMainPathsFromCLI(self, filesOrDirs):
        # Process the files and directories given on the command line;
        # directories are walked recursively.
        try:
            from tqdm import tqdm
        except ImportError:
            # tqdm is optional: fall back to a pass-through.
            def tqdm(x):
                return x

        with ThreadPoolExecutor(max_workers=cpu_count()) as pool:
            for fileOrDir in tqdm(filesOrDirs):
                fileOrDir = Path(fileOrDir).resolve().absolute()
                if fileOrDir.is_dir():
                    files = [el for el in fileOrDir.glob("**/*") if not el.is_dir()]
                else:
                    files = [fileOrDir]

                for f in files:
                    await self.loop.run_in_executor(pool, partial(self.processEachFilePath, f))

    def main(self, *filesOrDirs):
        # Entry point of the subcommand. A single "-" argument, as the help
        # text documents, switches to reading the file list from stdin;
        # the historical "0" spelling is still accepted.
        self._repl = None
        # A dedicated loop avoids the deprecated implicit loop creation of
        # asyncio.get_event_loop().
        self.loop = asyncio.new_event_loop()
        try:
            if len(filesOrDirs) == 1 and filesOrDirs[0] in ("-", "0"):
                coro = self.asyncMainPathsFromStdIn()
            else:
                coro = self.asyncMainPathsFromCLI(filesOrDirs)
            self.loop.run_until_complete(coro)
        finally:
            self.loop.close()
131+
132+
133+
# Run the command-line application when this module is executed directly.
if __name__ == "__main__":
    CLI.run()

https_everywhere/adapter.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from logging_helper import setup_logging
44

55
import urllib3
6-
from urllib3.util.url import parse_url
76

87
import requests
98
from requests.adapters import HTTPAdapter
@@ -13,6 +12,7 @@
1312
from ._chrome_preload_hsts import _preload_including_subdomains
1413
from ._mozilla_preload_hsts import _preload_remove_negative
1514
from ._util import _check_in
15+
from .replacers.HSTSPreloadReplacer import apply_HSTS_preload
1616

1717
PY2 = str != "".__class__
1818
if PY2:
@@ -155,10 +155,7 @@ def __init__(self, *args, **kwargs):
155155

156156
def get_redirect(self, url):
    """Return the https upgrade for ``url`` if its host is preloaded, else defer to the parent class."""
    if url.startswith("http://"):
        new_url = apply_HSTS_preload(url, self._domains)
        # apply_HSTS_preload hands back its input when the host is not in
        # the preload set. Only a real rewrite counts as a redirect;
        # otherwise fall through to the parent, as before the refactoring.
        if new_url != url:
            return new_url

    return super(PreloadHSTSAdapter, self).get_redirect(url)
164161

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import re
2+
3+
from urllib3.util.url import parse_url
4+
5+
from . import ReplaceContext, SingleURIReplacer
6+
7+
# Matches http:// and ftp:// URIs embedded in free text: optional
# "user[:password]@" part (the only capturing group), then the host
# characters, then an optional path. "https://" is not matched because the
# scheme must be followed directly by ":".
_URI_PATTERN = r"(?:http|ftp):\/\/?((?:[\w-]+)(?::[\w-]+)?@)?[\w\.:()-]+(?:\/[\w\.:()/-]*)?"
uri_re = re.compile(_URI_PATTERN)
10+
11+
12+
class InBufferReplacer(SingleURIReplacer):
    """Rewrites every URI found in a text buffer with a single-URI replacer.

    Subclasses set ``FACS`` to a callable that builds the single-URI
    replacer from the constructor's keyword arguments.
    """

    __slots__ = ("singleURIReplacer",)
    FACS = None

    def __init__(self, **kwargs):
        self.singleURIReplacer = self.__class__.FACS(**kwargs)

    def _rewriteURI(self, uri):
        """Run the single-URI replacer on ``uri`` and return its context."""
        ctx = ReplaceContext(uri)
        self.singleURIReplacer(ctx)
        return ctx

    def _rePlaceFunc(self, m):
        """``re.sub`` callback: return the rewritten URI, or the match text if nothing changed."""
        uri = m.group(0)
        ctx = self._rewriteURI(uri)
        if ctx.count > 0:
            return ctx.res
        return uri

    def __call__(self, inputStr):
        """Rewrite all URIs in ``inputStr``.

        Returns a ``ReplaceContext`` whose ``res`` is the resulting text and
        whose ``count`` is the number of URIs that were actually rewritten.
        """
        # ``re.subn`` counts every match, including URIs that were left
        # untouched, so the count is kept here instead. It lives in a local
        # so that concurrent calls on one instance do not share it.
        rewritten = 0

        def substitute(m):
            nonlocal rewritten
            uri = m.group(0)
            ctx = self._rewriteURI(uri)
            if ctx.count > 0:
                rewritten += 1
                return ctx.res
            return uri

        return ReplaceContext(uri_re.sub(substitute, inputStr), rewritten)
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from os import close
2+
from pathlib import Path
3+
from tempfile import NamedTemporaryFile
4+
5+
6+
class InFileReplacer:
    """Applies an in-buffer replacer to a file, line by line.

    ``inBufferReplacer`` is called with each line and must return an object
    with ``res`` (the resulting line) and ``count`` (number of replacements
    made in it).
    """

    __slots__ = ("inBufferReplacer", "encoding")

    def __init__(self, inBufferReplacer, encoding="utf-8"):
        self.inBufferReplacer = inBufferReplacer
        self.encoding = encoding

    def __call__(self, inputFilePath, safe=True):
        """Rewrite ``inputFilePath`` and return the number of replacements made."""
        if safe:
            return self.safe(inputFilePath)
        return self.unsafe(inputFilePath)

    def safe(self, inputFilePath):
        """Rewrite the file through a temporary sibling file.

        The temporary file is created only once a line actually changes; it
        then replaces the original. A file without replacements is left
        untouched. Line endings and the permission bits of the original are
        preserved. On any error the temporary file is removed and the
        exception is re-raised.

        Returns the total number of replacements.
        """
        from shutil import copymode

        inputFilePath = Path(inputFilePath)
        replaced = 0
        # Characters read before the first changed line. Text-mode tell()
        # is an opaque cookie, not a character count, so the prefix length
        # is tracked explicitly.
        unchangedChars = 0
        fo = None

        try:
            # newline="" disables newline translation on both ends, so the
            # counts are exact and CRLF files stay CRLF.
            with open(inputFilePath, "rt", encoding=self.encoding, newline="") as fi:
                for l in fi:
                    ctx = self.inBufferReplacer(l)
                    if ctx.count:
                        if fo is None:
                            fo = NamedTemporaryFile(
                                mode="wt",
                                encoding=self.encoding,
                                newline="",
                                suffix="new",
                                prefix=inputFilePath.stem,
                                dir=inputFilePath.parent,
                                delete=False,
                            )
                            # Copy everything that preceded this line.
                            with open(
                                inputFilePath, "rt", encoding=self.encoding, newline=""
                            ) as head:
                                fo.write(head.read(unchangedChars))
                        fo.write(ctx.res)
                        replaced += ctx.count
                    elif fo is not None:
                        fo.write(l)
                    else:
                        unchangedChars += len(l)

            if fo is not None:
                fo.close()
                # The temporary file is created with restrictive permissions;
                # carry the original mode over before swapping.
                copymode(inputFilePath, fo.name)
                # replace() overwrites an existing target on every platform.
                Path(fo.name).replace(inputFilePath)
        except BaseException:
            if fo is not None:
                fo.close()
                tmpFilePath = Path(fo.name)
                if tmpFilePath.exists():
                    tmpFilePath.unlink()
            raise
        return replaced

    def unsafe(self, inputFilePath):
        """Placeholder for true in-place editing; warns and falls back to ``safe``."""
        from warnings import warn

        warn("Unsafe in-place editing is not yet implamented")
        return self.safe(inputFilePath)

https_everywhere/core/__init__.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from functools import partial
2+
3+
4+
class ReplaceContext:
    """Mutable state passed through a chain of replacers.

    ``res`` holds the current text, ``count`` the number of replacements
    recorded so far, and ``shouldStop`` tells a replacer chain to stop early.
    """

    __slots__ = ("res", "shouldStop", "count")

    def __init__(self, res, count=0, shouldStop=False):
        self.res, self.count, self.shouldStop = res, count, shouldStop
11+
12+
13+
class SingleURIReplacer:
    """Interface of a replacer that works on one URI held in a context object.

    Both methods must be overridden; the base versions only raise.
    """

    def __init__(self, arg):
        """Subclasses receive their data set here."""
        raise NotImplementedError()

    def __call__(self, ctx):
        """Subclasses process the context ``ctx`` here."""
        raise NotImplementedError()
19+
20+
21+
class CombinedReplacer(SingleURIReplacer):
    """Runs several replacers, in order, over the same context."""

    __slots__ = ("children",)

    def __init__(self, children):
        self.children = children

    def __call__(self, ctx):
        """Apply each child to ``ctx`` until one sets ``shouldStop``; return ``ctx``."""
        for child in self.children:
            child(ctx)
            if ctx.shouldStop:
                return ctx
        return ctx
33+
34+
35+
class CombinedReplacerFactory:
    """Builds a ``CombinedReplacer`` from keyword arguments.

    ``args2Ctors`` maps a keyword name to the constructor called with that
    keyword's value.
    """

    __slots__ = ("args2Ctors", "ctor")

    def __init__(self, args2Ctors):
        self.args2Ctors = args2Ctors

    def _gen_replacers(self, kwargs):
        """Yield one replacer per known keyword, in the order the keywords appear in ``kwargs``."""
        lookup = self.args2Ctors.get
        for name in kwargs:
            ctor = lookup(name, None)
            if not ctor:
                # Keywords without a registered constructor are ignored.
                continue
            yield ctor(kwargs[name])

    def __call__(self, **kwargs):
        """Return a ``CombinedReplacer`` over the replacers built from ``kwargs``."""
        replacers = tuple(self._gen_replacers(kwargs))
        return CombinedReplacer(replacers)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from .. import _rules
2+
from .._rules import _get_rulesets, https_url_rewrite
3+
from ..core import SingleURIReplacer
4+
5+
6+
class HEReplacer(SingleURIReplacer):
    """Rewrites a URI with ``https_url_rewrite`` and a set of rulesets."""

    __slots__ = ("rulesets",)

    def __init__(self, rulesets):
        """Store ``rulesets``; when it is ``None``, use the package-level data instead."""
        if rulesets is None:
            # presumably _get_rulesets() populates _rules._DATA — TODO confirm
            _get_rulesets()
            rulesets = _rules._DATA
        self.rulesets = rulesets

    def __call__(self, ctx):
        """Rewrite ``ctx.res`` and bump ``ctx.count`` if the URI changed."""
        before = ctx.res
        after = https_url_rewrite(before, self.rulesets)
        ctx.res = after
        if before != after:
            ctx.count += 1
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from urllib3.util.url import parse_url
2+
3+
from .._chrome_preload_hsts import \
4+
_preload_including_subdomains as _get_preload_chrome
5+
from .._mozilla_preload_hsts import \
6+
_preload_remove_negative as _get_preload_mozilla
7+
from .._util import _check_in
8+
from ..core import SingleURIReplacer
9+
10+
11+
def apply_HSTS_preload(url, domains):
    """Upgrade ``url`` to https when it is plain http and its host is preloaded.

    ``domains`` is the preload set that ``_check_in`` tests the host against.
    Returns the rewritten URL, or ``url`` unchanged when it has no scheme,
    uses any scheme other than http, has no host, or its host is not in
    ``domains``.
    """
    # NOTE(review): parse_url may raise on malformed input — confirm that
    # callers tolerate this.
    p = parse_url(url)
    scheme = p.scheme
    # HSTS only concerns http: leave ftp:// and scheme-less input alone
    # (a missing scheme used to fail on len(None)).
    if scheme is None or scheme.lower() != "http":
        return url
    if p.host and _check_in(domains, p.host):
        # Swap only the scheme; everything after it is kept verbatim.
        return "https:" + url[len(scheme) + 1:]
    return url
17+
18+
19+
class HSTSPreloadReplacer(SingleURIReplacer):
    """Rewrites a URI with ``apply_HSTS_preload`` and a preload set."""

    __slots__ = ("preloads",)

    def __init__(self, preloads):
        """Store ``preloads``; when it is ``None``, combine the Mozilla and Chrome sets."""
        if preloads is None:
            mozilla = _get_preload_mozilla()
            chrome = _get_preload_chrome()
            preloads = mozilla | chrome
        self.preloads = preloads

    def __call__(self, ctx):
        """Rewrite ``ctx.res`` and bump ``ctx.count`` if the URI changed."""
        before = ctx.res
        after = apply_HSTS_preload(before, self.preloads)
        ctx.res = after
        if before != after:
            ctx.count += 1

https_everywhere/replacers/__init__.py

Whitespace-only changes.

setup.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,7 @@
6161
classifiers=classifiers.splitlines(),
6262
tests_require=["unittest-expander", "lxml", "tldextract", "regex"],
6363
# lxml is optional, needed for testing upstream rules
64+
entry_points = {
65+
"console_scripts": ["pyhttpeverywhere = https_everywhere.__main__:CLI"]
66+
}
6467
)

0 commit comments

Comments
 (0)