Manylinux downloader and extractor

This commit is contained in:
Valentin Niess
2025-05-19 17:13:29 +02:00
parent 08ee36fc45
commit 1fdb439e70
7 changed files with 581 additions and 15 deletions

View File

@@ -77,7 +77,7 @@ _excluded_libs = None
def patch_binary(path, libdir, recursive=True):
'''Patch the RPATH of a binary and and fetch its dependencies
'''Patch the RPATH of a binary and fetch its dependencies
'''
global _excluded_libs

View File

@@ -0,0 +1,8 @@
from .config import Arch, LinuxTag, PythonImpl, PythonVersion
from .download import Downloader
from .extract import ImageExtractor, PythonExtractor
__all__ = ['Arch', 'Downloader', 'ImageExtractor', 'LinuxTag',
'PythonExtractor', 'PythonImpl', 'PythonVersion']

View File

@@ -0,0 +1,81 @@
from enum import auto, Enum
import platform
from typing import NamedTuple, Union
# Public API of this module. LinuxTag is defined below and re-exported by
# the package __init__, so it belongs in __all__ alongside the other types.
__all__ = ['Arch', 'LinuxTag', 'PythonImpl', 'PythonVersion']
class Arch(Enum):
    '''Platform architectures supported by manylinux images.'''

    AARCH64 = auto()
    I686 = auto()
    X86_64 = auto()

    def __str__(self):
        # The textual form is simply the lowercase member name, e.g. 'x86_64'.
        return self.name.lower()

    @classmethod
    def from_host(cls) -> 'Arch':
        '''Return the architecture of the host machine.'''
        return cls.from_str(platform.machine())

    @classmethod
    def from_str(cls, value) -> 'Arch':
        '''Parse an architecture from its string representation.

        Raises NotImplementedError for unsupported architectures.
        '''
        matches = [arch for arch in cls if str(arch) == value]
        if matches:
            return matches[0]
        raise NotImplementedError(value)
class LinuxTag(Enum):
    '''Manylinux platform tags supported by the downloader.'''

    MANYLINUX_1 = auto()
    MANYLINUX_2010 = auto()
    MANYLINUX_2014 = auto()
    MANYLINUX_2_24 = auto()
    MANYLINUX_2_28 = auto()

    def __str__(self):
        # Legacy tags are spelled without underscores (e.g. 'manylinux2014'),
        # while PEP 600 style tags keep them (e.g. 'manylinux_2_24').
        legacy = (LinuxTag.MANYLINUX_1, LinuxTag.MANYLINUX_2010,
                  LinuxTag.MANYLINUX_2014)
        name = self.name.lower()
        return name.replace('_', '') if self in legacy else name

    @classmethod
    def from_str(cls, value) -> 'LinuxTag':
        '''Parse a platform tag from its string representation.

        Raises NotImplementedError for unknown tags.
        '''
        for candidate in cls:
            if str(candidate) == value:
                return candidate
        raise NotImplementedError(value)
class PythonImpl(Enum):
    '''Python implementations supported by the extractor (CPython only).'''

    CPYTHON = auto()
class PythonVersion(NamedTuple):
    '''Semantic (major, minor, patch) Python version.

    The patch component may be a string for pre-releases (e.g. '0rc1'),
    since it is kept verbatim when it does not parse as an integer.
    '''
    major: int
    minor: int
    patch: Union[int, str]

    @classmethod
    def from_str(cls, value: str) -> 'PythonVersion':
        '''Parse a version from a dotted string (e.g. '3.11.4').'''
        major, minor, patch = value.split('.', 2)
        try:
            patch = int(patch)
        except ValueError:
            # Pre-release suffixes (e.g. '0rc1') are kept as strings.
            pass
        return cls(int(major), int(minor), patch)

    def long(self) -> str:
        '''Return the full 'major.minor.patch' representation.'''
        return f'{self.major}.{self.minor}.{self.patch}'

    def short(self) -> str:
        '''Return the 'major.minor' representation.'''
        return f'{self.major}.{self.minor}'

View File

@@ -0,0 +1,145 @@
import collections
from dataclasses import dataclass, field
import glob
import hashlib
import json
from pathlib import Path
import requests
import shutil
import tempfile
from typing import List, Optional
from .config import Arch, LinuxTag
from ..utils.log import debug, log
# Streaming chunk size, in bytes, for layer downloads.
# NOTE(review): 8189 is an odd value -- 8 KiB would be 8192; confirm intent.
CHUNK_SIZE = 8189
# HTTP status code of a successful request.
SUCCESS = 200
class DownloadError(Exception):
    '''Raised when fetching data from the registry fails.'''
class TarError(Exception):
    '''Raised on tarball related errors.'''
@dataclass(frozen=True)
class Downloader:
    '''Download manylinux Docker image layers from quay.io.

    Layers are cached under `destination/layers` and indexed by the
    `destination/tags/<tag>.json` manifests; layers referenced by no
    manifest are pruned after each download.
    '''

    tag: LinuxTag
    # Manylinux tag (e.g. LinuxTag.MANYLINUX_2014).

    arch: Optional[Arch] = None
    # Platform architecture; defaults to the host one.

    image: str = field(init=False)
    # Docker image name, derived from tag and arch.

    token: str = field(init=False)
    # Pull token, set during download().

    def __post_init__(self):
        # Set host arch if not explicitly specified. The dataclass is
        # frozen, so attributes are set through object.__setattr__.
        if self.arch is None:
            object.__setattr__(self, 'arch', Arch.from_host())
        # Set image name.
        object.__setattr__(self, 'image', f'{self.tag}_{self.arch}')

    def download(
        self,
        destination: Optional[Path]=None,
        tag: Optional[str] = 'latest'):
        '''Download any missing layers of `image:tag` below `destination`.

        Raises
        ------
        DownloadError
            On any HTTP failure, or on a layer checksum mismatch.
        '''
        destination = destination or Path(self.image)

        # Authenticate to quay.io (anonymous pull scope).
        repository = f'pypa/{self.image}'
        url = 'https://quay.io/v2/auth'
        url = f'{url}?service=quay.io&scope=repository:{repository}:pull'
        debug('GET', url)
        r = requests.request('GET', url)
        if r.status_code == SUCCESS:
            object.__setattr__(self, 'token', r.json()['token'])
        else:
            raise DownloadError(r.status_code, r.text, r.headers)

        # Fetch the image manifest.
        url = f'https://quay.io/v2/{repository}/manifests/{tag}'
        headers = {
            'Authorization': f'Bearer {self.token}',
            'Accept': 'application/vnd.docker.distribution.manifest.v2+json'
        }
        debug('GET', url)
        r = requests.request('GET', url, headers=headers)
        if r.status_code == SUCCESS:
            image_digest = r.headers['Docker-Content-Digest'].split(':', 1)[-1]
            manifest = r.json()
        else:
            raise DownloadError(r.status_code, r.text, r.headers)

        # List required layers, and filter out the already cached ones.
        required = [layer['digest'].split(':', 1)[-1]
                    for layer in manifest['layers']]
        is_missing = lambda hash_: \
            not (destination / f'layers/{hash_}.tar.gz').exists()
        missing = tuple(filter(is_missing, required))

        # Fetch missing layers (streamed, with a sha256 integrity check).
        with tempfile.TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            for i, hash_ in enumerate(missing):
                log('DOWNLOAD', f'{self.image} ({tag}) '
                                f'[{i + 1} / {len(missing)}]')
                filename = f'{hash_}.tar.gz'
                url = f'https://quay.io/v2/{repository}/blobs/sha256:{hash_}'
                debug('GET', url)
                r = requests.request('GET', url, headers=headers, stream=True)
                if r.status_code == SUCCESS:
                    debug('STREAM', filename)
                else:
                    raise DownloadError(r.status_code, r.text, r.headers)
                hasher = hashlib.sha256()
                tmp = workdir / 'layer.tgz'
                with open(tmp, "wb") as f:
                    for chunk in r.iter_content(CHUNK_SIZE):
                        if chunk:
                            f.write(chunk)
                            hasher.update(chunk)
                h = hasher.hexdigest()
                if h != hash_:
                    # Bug fix: the error message previously referenced an
                    # undefined variable (`name`), raising a NameError
                    # instead of the intended DownloadError.
                    raise DownloadError(
                        f'bad hash (expected {hash_}, found {h})'
                    )
                layers_dir = destination / 'layers'
                layers_dir.mkdir(exist_ok=True, parents=True)
                shutil.move(tmp, layers_dir / filename)

        # Record the tag manifest.
        tags_dir = destination / 'tags'
        tags_dir.mkdir(exist_ok=True, parents=True)
        with open(tags_dir / f'{tag}.json', "w") as f:
            json.dump({'digest': image_digest, 'layers': required}, f)

        # Remove layers that no tag manifest references anymore. Note
        # that distinct local variables are used below, so as not to
        # shadow the `tag` argument.
        required = set(required)
        for meta_path in glob.glob(str(destination / 'tags/*.json')):
            with open(meta_path) as f:
                meta = json.load(f)
            required |= set(meta["layers"])
        required = [f'{hash_}.tar.gz' for hash_ in required]
        for layer in glob.glob(str(destination / 'layers/*.tar.gz')):
            layer = Path(layer)
            if layer.name not in required:
                debug('REMOVE', f'{self.image} [layer/{layer.stem}]')
                layer.unlink()

View File

@@ -0,0 +1,327 @@
from dataclasses import dataclass, field
from distutils.version import LooseVersion
import glob
import json
import os
import re
from pathlib import Path
import shutil
import stat
import subprocess
from typing import Dict, List, NamedTuple, Optional, Union
from .config import Arch, PythonImpl, PythonVersion
from ..utils.deps import ensure_excludelist, EXCLUDELIST
from ..utils.log import debug, log
@dataclass(frozen=True)
class PythonExtractor:
    '''Python extractor from an extracted Manylinux image.

    Locates a Python installation under `prefix/opt/python/<tag>`, then
    clones its runtime, packages, shared-library dependencies and data
    files into a destination tree, patching RPATHs with patchelf so the
    result is relocatable.
    '''

    arch: Arch
    '''Target architecture'''

    prefix: Path
    '''Target image path'''

    tag: str
    '''Python binary tag'''

    excludelist: Optional[Path] = None
    '''Exclude list for shared libraries.'''

    patchelf: Optional[Path] = None
    '''Patchelf executable.'''

    excluded: List[str] = field(init=False)
    '''Excluded shared libraries.'''

    impl: PythonImpl = field(init=False)
    '''Python implementation'''

    library_path: List[str] = field(init=False)
    '''Search paths for libraries (LD_LIBRARY_PATH)'''

    python_prefix: Path = field(init=False)
    '''Python installation prefix'''

    version: PythonVersion = field(init=False)
    '''Python version'''

    def __post_init__(self) -> None:
        # Locate Python installation. The `opt/python/<tag>` entry is a
        # symlink; only absolute link targets are supported.
        link = os.readlink(self.prefix / f'opt/python/{self.tag}')
        if not link.startswith('/'):
            raise NotImplementedError()
        # Strip the leading '/' so the absolute link is re-rooted in prefix.
        object.__setattr__(self, 'python_prefix', self.prefix / link[1:])
        # Parse implementation and version from a name like 'cpython-3.11.4'.
        head, tail = Path(link).name.split('-', 1)
        if head == 'cpython':
            impl = PythonImpl.CPYTHON
            version = PythonVersion.from_str(tail)
        else:
            raise NotImplementedError()
        object.__setattr__(self, 'impl', impl)
        object.__setattr__(self, 'version', version)
        # Set libraries search path (system libdir, /usr/local/lib and the
        # image's bundled OpenSSL, when present).
        paths = []
        if self.arch in (Arch.AARCH64, Arch.X86_64):
            paths.append(self.prefix / 'lib64')
        elif self.arch == Arch.I686:
            paths.append(self.prefix / 'lib')
        else:
            raise NotImplementedError()
        paths.append(self.prefix / 'usr/local/lib')
        ssl = glob.glob(str(self.prefix / 'opt/_internal/openssl-*'))
        if ssl:
            paths.append(Path(ssl[0]) / 'lib')
        object.__setattr__(self, 'library_path', paths)
        # Set excluded libraries from the exclude list file (one library
        # name per line; blank lines and '#' comments are skipped).
        if self.excludelist:
            excludelist = Path(self.excludelist)
        else:
            ensure_excludelist()
            excludelist = Path(EXCLUDELIST)
        excluded = []
        with excludelist.open() as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith('#'):
                    excluded.append(line)
        object.__setattr__(self, 'excluded', excluded)
        # Set patchelf, if not provided: look beside this module first,
        # then under ~/.local/bin.
        if self.patchelf is None:
            paths = (
                Path(__file__).parent / 'bin',
                Path.home() / '.local/bin'
            )
            for path in paths:
                patchelf = path / 'patchelf'
                if patchelf.exists():
                    break
            else:
                raise NotImplementedError()
            object.__setattr__(self, 'patchelf', patchelf)
        else:
            # NOTE(review): assert is stripped under `python -O`; an
            # explicit raise might be preferable here.
            assert(self.patchelf.exists())

    def extract(self, destination: Path) -> None:
        '''Extract Python runtime.

        Clones the interpreter, pip, the standard library, headers,
        shared-library dependencies, SSL certificates and Tcl/Tk data
        below `destination`, patching RPATHs to $ORIGIN-relative paths.
        '''
        python = f'python{self.version.short()}'
        runtime = f'bin/{python}'
        packages = f'lib/{python}'
        pip = f'bin/pip{self.version.short()}'
        # Locate include files (e.g. 'include/python3.11').
        include = glob.glob(str(self.python_prefix / 'include/*'))
        if include:
            include = Path(include[0]).name
            include = f'include/{include}'
        else:
            raise NotImplementedError()
        # Clone Python runtime, with pythonX -> pythonX.Y and
        # python -> pythonX convenience symlinks.
        (destination / 'bin').mkdir(exist_ok=True, parents=True)
        shutil.copy(self.python_prefix / runtime, destination / runtime)
        short = Path(destination / f'bin/python{self.version.major}')
        short.unlink(missing_ok=True)
        short.symlink_to(python)
        short = Path(destination / 'bin/python')
        short.unlink(missing_ok=True)
        short.symlink_to(f'python{self.version.major}')
        # Clone pip wrapper, replacing its shebang with a /bin/sh trampoline
        # that re-executes the relocated interpreter.
        with open(self.python_prefix / pip) as f:
            f.readline() # Skip shebang.
            body = f.read()
        with open(destination / pip, 'w') as f:
            f.write('#! /bin/sh\n')
            # Note: inside this f-string `{0}` evaluates to the literal 0,
            # so `${0}` renders as `$0` in the generated script.
            f.write(' '.join((
                '"exec"',
                f'"$(dirname $(readlink -f ${0}))/{python}"',
                '"$0"',
                '"$@"\n'
            )))
            f.write(body)
        shutil.copymode(self.python_prefix / pip, destination / pip)
        short = Path(destination / f'bin/pip{self.version.major}')
        short.unlink(missing_ok=True)
        short.symlink_to(f'pip{self.version.short()}')
        short = Path(destination / 'bin/pip')
        short.unlink(missing_ok=True)
        short.symlink_to(f'pip{self.version.major}')
        # Clone Python packages (stdlib and headers).
        for folder in (packages, include):
            shutil.copytree(self.python_prefix / folder, destination / folder,
                            symlinks=True, dirs_exist_ok=True)
        # Remove some clutters (test suite, bytecode caches).
        shutil.rmtree(destination / packages / 'test', ignore_errors=True)
        for root, dirs, files in os.walk(destination / packages):
            root = Path(root)
            for d in dirs:
                if d == '__pycache__':
                    shutil.rmtree(root / d, ignore_errors=True)
            for f in files:
                if f.endswith('.pyc'):
                    (root / f).unlink()
        # Map binary dependencies of the interpreter and of all extension
        # modules in lib-dynload.
        libs = self.ldd(self.python_prefix / f'bin/{python}')
        path = Path(self.python_prefix / f'{packages}/lib-dynload')
        for module in glob.glob(str(path / "*.so")):
            l = self.ldd(module)
            libs.update(l)
        # Copy and patch binary dependencies.
        libdir = destination / 'lib'
        for (name, src) in libs.items():
            dst = libdir / name
            shutil.copy(src, dst, follow_symlinks=True)
            # Some libraries are read-only, which prevents overriding the
            # destination directory. Below, we change the permission of
            # destination files to read-write (for the owner).
            mode = dst.stat().st_mode
            if not (mode & stat.S_IWUSR):
                mode = mode | stat.S_IWUSR
                dst.chmod(mode)
            self.set_rpath(dst, '$ORIGIN')
        # Patch RPATHs of binary modules (relative to the copied libdir).
        path = Path(destination / f'{packages}/lib-dynload')
        for module in glob.glob(str(path / "*.so")):
            src = Path(module)
            dst = os.path.relpath(libdir, src.parent)
            self.set_rpath(src, f'$ORIGIN/{dst}')
        # Patch RPATHs of Python runtime.
        src = destination / runtime
        dst = os.path.relpath(libdir, src.parent)
        self.set_rpath(src, f'$ORIGIN/{dst}')
        # Copy SSL certificates (i.e. clone certifi). The certs.pem symlink
        # is expected to point inside a certifi site-package.
        certs = self.prefix / 'opt/_internal/certs.pem'
        if certs.is_symlink():
            dst = self.prefix / str(certs.readlink())[1:]
            certifi = dst.parent
            assert(certifi.name == 'certifi')
            site_packages = certifi.parent
            assert(site_packages.name == 'site-packages')
            for src in glob.glob(str(site_packages / 'certifi*')):
                src = Path(src)
                dst = destination / f'{packages}/site-packages/{src.name}'
                if not dst.exists():
                    shutil.copytree(src, dst, symlinks=True)
        else:
            raise NotImplementedError()
        # Copy Tcl & Tk data for the newest tk<version> found in the image.
        tcltk_src = self.prefix / 'usr/local/lib'
        tx_version = []
        for match in glob.glob(str(tcltk_src / 'tk*')):
            path = Path(match)
            if path.is_dir():
                tx_version.append(LooseVersion(path.name[2:]))
        tx_version.sort()
        # NOTE(review): raises IndexError when no tk* directory exists --
        # presumably manylinux images always ship one; confirm.
        tx_version = tx_version[-1]
        tcltk_dir = Path(destination / 'usr/share/tcltk')
        tcltk_dir.mkdir(exist_ok=True, parents=True)
        for tx in ('tcl', 'tk'):
            name = f'{tx}{tx_version}'
            src = tcltk_src / name
            dst = tcltk_dir / name
            shutil.copytree(src, dst, symlinks=True, dirs_exist_ok=True)

    def ldd(self, target: Path) -> Dict[str, Path]:
        '''Cross-platform implementation of ldd, using readelf.

        Returns a mapping from library name to resolved path, following
        transitive NEEDED entries and skipping excluded libraries.
        '''
        pattern = re.compile(r'[(]NEEDED[)]\s+Shared library:\s+\[([^\]]+)\]')
        dependencies = dict()
        def recurse(target: Path):
            # Accumulates into the enclosing `dependencies` dict.
            result = subprocess.run(f'readelf -d {target}', shell=True,
                                    check=True, capture_output=True)
            stdout = result.stdout.decode()
            matches = pattern.findall(stdout)
            for match in matches:
                if (match not in dependencies) and (match not in self.excluded):
                    path = self.locate_library(match)
                    dependencies[match] = path
                    # NOTE(review): recurse() returns None; `subs` is unused.
                    subs = recurse(path)
        recurse(target)
        return dependencies

    def locate_library(self, name: str) -> Path:
        '''Locate a library given its qualified name.

        Searches `library_path` in order; raises FileNotFoundError when
        the library is not found.
        '''
        for dirname in self.library_path:
            path = dirname / name
            if path.exists():
                return path
        else:
            raise FileNotFoundError(name)

    def set_rpath(self, target: Path, rpath: str) -> None:
        '''Set the RPATH of `target` with patchelf.

        The current RPATH is read first, and patchelf is only invoked
        when it differs from the requested value.
        '''
        cmd = f'{self.patchelf} --print-rpath {target}'
        result = subprocess.run(cmd, shell=True, check=True,
                                capture_output=True)
        current_rpath = result.stdout.decode().strip()
        if current_rpath != rpath:
            cmd = f"{self.patchelf} --set-rpath '{rpath}' {target}"
            subprocess.run(cmd, shell=True, check=True, capture_output=True)
@dataclass(frozen=True)
class ImageExtractor:
    '''Manylinux image extractor from downloaded layers.'''

    prefix: Path
    # Manylinux image prefix (as produced by Downloader).

    tag: Optional[str] = 'latest'
    # Manylinux image tag.

    def extract(self, destination: Path):
        '''Extract all image layers to `destination`.

        Layers are unpacked in manifest order, so later layers override
        earlier ones. Raises subprocess.CalledProcessError on failure.
        '''
        with open(self.prefix / f'tags/{self.tag}.json') as f:
            meta = json.load(f)
        layers = meta['layers']
        for layer in layers:
            debug('EXTRACT', f'{layer}.tar.gz')
            filename = self.prefix / f'layers/{layer}.tar.gz'
            # Bug fix: the archive path was missing from the tar command
            # (the computed `filename` was never used).
            cmd = ' && '.join((
                f'mkdir -p {destination}',
                f'tar -xzf {filename} -C {destination}',
                f'chmod u+rw -R {destination}'
            ))
            subprocess.run(cmd, shell=True, check=True,
                           capture_output=True)

View File

@@ -9,28 +9,30 @@ from .tmp import TemporaryDirectory
from .url import urlretrieve
__all__ = ['APPIMAGETOOL', 'EXCLUDELIST', 'PATCHELF', 'PREFIX',
'ensure_appimagetool', 'ensure_excludelist', 'ensure_patchelf']
_ARCH = platform.machine()
_CACHE_DIR = os.path.expanduser('~/.cache/python-appimage')
PREFIX = os.path.abspath(os.path.dirname(__file__) + '/..')
'''Package installation prefix'''
APPIMAGETOOL_DIR = os.path.expanduser('~/.local/bin')
APPIMAGETOOL_DIR = os.path.join(_CACHE_DIR, 'bin')
'''Location of the appimagetool binary'''
APPIMAGETOOL_VERSION = '12'
'''Version of the appimagetool binary'''
EXCLUDELIST = PREFIX + '/data/excludelist'
EXCLUDELIST = os.path.join(_CACHE_DIR, 'share/excludelist')
'''AppImage exclusion list'''
PATCHELF = os.path.expanduser('~/.local/bin/patchelf')
PATCHELF = os.path.join(_CACHE_DIR, 'bin/patchelf')
'''Location of the PatchELF binary'''
PATCHELF_VERSION = '0.14.3'
'''Version of the patchelf binary'''
def ensure_appimagetool(dry=False):
'''Fetch appimagetool from the web if not available locally
'''
@@ -91,19 +93,18 @@ def ensure_patchelf():
if os.path.exists(PATCHELF):
return False
iarch = 'i386' if _ARCH == 'i686' else _ARCH
appimage = 'patchelf-{0:}.AppImage'.format(iarch)
baseurl = 'https://github.com/niess/patchelf.appimage/releases/download'
tgz = '-'.join(('patchelf', _PATCHELF_VERSION, _ARCH)) + '.tar.gz'
baseurl = 'https://github.com/NixOS/patchelf'
log('INSTALL', 'patchelf from %s', baseurl)
dirname = os.path.dirname(PATCHELF)
patchelf = dirname + '/patchelf'
make_tree(dirname)
with TemporaryDirectory() as tmpdir:
urlretrieve(os.path.join(baseurl, 'rolling', appimage), appimage)
os.chmod(appimage, stat.S_IRWXU)
system(('./' + appimage, '--appimage-extract'))
copy_file('squashfs-root/usr/bin/patchelf', patchelf)
urlretrieve(os.path.join(baseurl, 'releases', 'download',
_PATCHELF_VERSION, tgz), tgz)
system(('tar', 'xzf', tgz))
copy_file('bin/patchelf', patchelf)
os.chmod(patchelf, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
return True

View File

@@ -32,6 +32,10 @@ def urlretrieve(url, filename=None):
else:
debug('DOWNLOAD', '%s as %s', url, filename)
parent_directory = os.path.dirname(filename)
if not os.path.exists(parent_directory):
os.makedirs(parent_directory)
if _urlretrieve is None:
data = urllib2.urlopen(url).read()
with open(filename, 'w') as f: