Skip to content

Commit

Permalink
new(kernel_crawler): opensuse tumbleweed switched to zstd for its repodata.
Browse files Browse the repository at this point in the history

Signed-off-by: Federico Di Pierro <[email protected]>
  • Loading branch information
FedeDP authored and poiana committed Nov 17, 2023
1 parent 03ac9c3 commit 5e01bc2
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 9 deletions.
5 changes: 3 additions & 2 deletions kernel_crawler/opensuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ def __init__(self, arch):
# the rest
rpm.SUSERpmMirror('https://mirrors.edge.kernel.org/opensuse/distribution/', 'repo/oss/', arch),
rpm.SUSERpmMirror('https://mirrors.edge.kernel.org/opensuse/distribution/', 'repo/oss/suse/', arch),
# opensuse site: tumbleweed
rpm.SUSERpmMirror('http://download.opensuse.org/', 'repo/oss/', arch, tumbleweed_filter),
# opensuse site: tumbleweed -> enforce zstd for repo:
# https://lists.opensuse.org/archives/list/[email protected]/thread/LJNSBPCMIOJMP37PFPV7C7EJVIOW26BN/
rpm.SUSERpmMirror('http://download.opensuse.org/', 'repo/oss/', arch, tumbleweed_filter, True),
# opensuse site: leaps
rpm.SUSERpmMirror('http://download.opensuse.org/distribution/leap/', 'repo/oss/', arch),
# opensuse Kernel repo - common
Expand Down
27 changes: 21 additions & 6 deletions kernel_crawler/rpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import sqlite3
import tempfile
import re
import zstandard as zstd
import io

from . import repo
from kernel_crawler.utils.download import get_url
Expand Down Expand Up @@ -166,7 +168,7 @@ def list_repos(self):

class SUSERpmMirror(RpmMirror):

def __init__(self, base_url, variant, arch, repo_filter=None):
def __init__(self, base_url, variant, arch, repo_filter=None, isZstd=False):
'''
SUSERpmMirror looks like a regular RpmMirror, except that it requires
the arch in the constructor. The arch is used for passing through to SUSERpmRepository,
Expand All @@ -179,6 +181,7 @@ def __init__(self, base_url, variant, arch, repo_filter=None):
repo_filter = lambda _: True
self.repo_filter = repo_filter
self.url = base_url
self.isZstd = isZstd

def list_repos(self):
'''
Expand All @@ -194,7 +197,7 @@ def list_repos(self):
dists = dists.content
doc = html.fromstring(dists, self.base_url)
dists = doc.xpath('/html/body//a[not(@href="../")]/@href')
ret = [SUSERpmRepository(self.dist_url(dist), self.arch) for dist in dists
ret = [SUSERpmRepository(self.dist_url(dist), self.arch, self.isZstd) for dist in dists
if dist.endswith('/')
and not dist.startswith('/')
and not dist.startswith('?')
Expand All @@ -210,13 +213,14 @@ class SUSERpmRepository(RpmRepository):
# the kernel headers package name pattern to search for in the package listing XML
_kernel_devel_pattern = 'kernel-default-devel-'

def __init__(self, base_url, arch):
def __init__(self, base_url, arch, isZstd):
'''
Constructor, which sets the base URL, the arch and the isZstd flag.
The arch is used for finding the correct package in the repomd.
The isZstd flag records whether the repo metadata is zstd-compressed.
'''
self.base_url = base_url
self.arch = arch
self.isZstd = isZstd

def get_repodb_url(self):
'''
Expand Down Expand Up @@ -279,11 +283,22 @@ def get_package_tree(self, filter=''):
with tempfile.NamedTemporaryFile() as tf:
tf.write(repodb)
tf.flush()
open_mode = 'r'
if self.isZstd:
open_mode = 'rb'

# regex searching through a file is more memory efficient
# than parsing the xml into an object structure with lxml etree
search = re.search(f'.*href="({package_match}.*rpm)', str(open(tf.name).read()))
if search:
kernel_default_devel_pkg_url = search.group(1)
with open(tf.name, mode=open_mode) as f:
if self.isZstd:
dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
stream_reader = dctx.stream_reader(f)
text = io.TextIOWrapper(stream_reader, encoding='utf-8').read()
else:
text = str(f.read())
search = re.search(f'.*href="({package_match}.*rpm)', text)
if search:
kernel_default_devel_pkg_url = search.group(1)
tf.close() # delete the tempfile to free up memory

# check to ensure a kernel_devel_pkg was found
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ semantic-version
pygit2
beautifulsoup4
rpmfile
zstandard
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
'semantic-version',
'pygit2',
'beautifulsoup4',
'rpmfile'
'rpmfile',
'zstandard'
],
)

0 comments on commit 5e01bc2

Please sign in to comment.