Slide 1: Pure Python Project Setup
Understanding proper Python project structure is crucial for maintainable codebases. We'll create a modern Python project with virtual environments, dependency management, and proper package organization that demonstrates the advantages of Pure Python over bundled distributions such as Anaconda.
# Project structure setup
project_root/
├── src/
│ ├── __init__.py
│ └── main.py
├── tests/
│ ├── __init__.py
│ └── test_main.py
├── requirements.txt
├── setup.py
└── venv/
# Terminal commands
python -m venv venv
source venv/bin/activate # Unix
.\venv\Scripts\activate # Windows
# requirements.txt
numpy==1.21.0
pandas==1.3.0
scikit-learn==0.24.2
Slide 2: Virtual Environment Management
Pure Python's virtual environments provide isolated package management per project, preventing dependency conflicts and ensuring reproducible environments. This approach is more explicit and controllable than Anaconda's global package management.
# Create and manage virtual environment
import subprocess
import sys
def setup_project_env():
    """Create a ``venv`` virtual environment and install ``requirements.txt``.

    The environment is created with the interpreter that runs this script,
    and the requirements are installed with the environment's own ``pip``
    executable.

    Raises:
        subprocess.CalledProcessError: If creating the environment or
            installing the requirements exits with a non-zero status.
    """
    # check=True aborts on failure instead of going on to run pip against a
    # missing or half-created environment.
    subprocess.run([sys.executable, "-m", "venv", "venv"], check=True)

    # The pip executable lives in a platform-specific folder inside the venv.
    if sys.platform == "win32":
        pip_path = "venv\\Scripts\\pip"
    else:
        pip_path = "venv/bin/pip"
    subprocess.run([pip_path, "install", "-r", "requirements.txt"], check=True)


if __name__ == "__main__":
    setup_project_env()
Slide 3: Dependency Management
Pure Python's pip package manager allows precise control over project dependencies. This example demonstrates how to manage, freeze, and install dependencies while maintaining a minimal footprint compared to Anaconda's bulk installation.
import pkg_resources
import subprocess
from pathlib import Path
def manage_dependencies(requirements_file="requirements.txt",
                        pinned_requirement="numpy==1.21.0"):
    """Snapshot the installed packages and optionally install a pinned one.

    Args:
        requirements_file: Path of the file that receives one
            ``name==version`` line per installed distribution. The file is
            overwritten.
        pinned_requirement: Requirement specifier passed to ``pip install``
            after the snapshot is written. Pass ``None`` to skip the install.

    Returns:
        Dict mapping each lower-cased distribution name to its version, as
        observed before the optional install step, sorted by name.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    # Imported locally so this block only relies on the standard library
    # names it brings in itself.
    import sys
    from importlib import metadata

    # importlib.metadata is the standard-library replacement for the
    # deprecated pkg_resources.working_set.
    found = {}
    for dist in metadata.distributions():
        name = dist.metadata.get("Name")
        if name:  # ignore entries whose metadata carries no name
            # setdefault keeps the first match on sys.path, i.e. the one
            # that would actually be imported.
            found.setdefault(name.lower(), dist.version)
    installed = dict(sorted(found.items()))

    # Save current environment
    with open(requirements_file, 'w', encoding="utf-8") as f:
        f.writelines(
            f"{package}=={version}\n" for package, version in installed.items()
        )

    # Install the pinned version with the pip that belongs to this
    # interpreter rather than whichever "pip" happens to be first on PATH.
    if pinned_requirement:
        subprocess.run(
            [sys.executable, "-m", "pip", "install",
             pinned_requirement, "--no-cache-dir"],
            check=True,
        )
    return installed
# Snapshot the environment (this rewrites requirements.txt), install the
# pinned numpy release, and print the returned name -> version mapping.
print(manage_dependencies())
Slide 4: Project-Specific Package Installation
In Pure Python, packages are installed within the project's virtual environment, maintaining isolation. This script demonstrates package management and verification within a specific project context.
import site
import os
from pathlib import Path
def verify_package_location(package_names=("numpy", "pandas", "scikit-learn"),
                            site_packages_dir=None):
    """Report which packages have install metadata in site-packages.

    A package counts as present when a ``*-info`` entry (for example
    ``numpy-1.21.0.dist-info``) exists whose distribution name equals the
    requested name. Names are compared case-insensitively with ``_`` and
    ``.`` treated like ``-``, so ``scikit-learn`` matches
    ``scikit_learn-0.24.2.dist-info`` while ``pandas`` no longer matches
    ``pandas_stubs-...``.

    Args:
        package_names: Iterable of distribution names to look for.
        site_packages_dir: Directory to inspect. When ``None``, every
            directory returned by ``site.getsitepackages()`` is searched,
            because the first entry is not the site-packages folder on all
            platforms.

    Returns:
        Dict mapping each requested name to ``True`` if matching metadata
        was found, ``False`` otherwise.
    """
    def normalize(name):
        # Make "Scikit_Learn", "scikit.learn" and "scikit-learn" comparable.
        return name.lower().replace("_", "-").replace(".", "-")

    if site_packages_dir is None:
        search_dirs = [Path(entry) for entry in site.getsitepackages()]
    else:
        search_dirs = [Path(site_packages_dir)]

    # Metadata entries are named "<name>-<version>...-info"; the
    # distribution name is everything before the first "-".
    installed_names = {
        normalize(entry.name.split("-", 1)[0])
        for directory in search_dirs
        for entry in directory.glob("*-info")
    }

    return {name: normalize(name) in installed_names for name in package_names}
# Print the per-package presence flags returned by verify_package_location().
print(verify_package_location())
Slide 5: Minimal Build Process
Pure Python enables creating lightweight, production-ready builds. This implementation shows how to create a minimal package distribution without unnecessary dependencies, reducing deployment costs and complexity.
from setuptools import setup, find_packages
import json
def create_minimal_build():
    """Run ``setuptools.setup`` for the ``src/`` layout and summarize its inputs.

    Requirements are read from ``requirements.txt`` in the current working
    directory. Blank lines, comment lines and pip option lines (those that
    start with ``-``, such as ``-r other.txt``) are skipped so that only
    real requirement specifiers are counted and passed to
    ``install_requires``.

    Returns:
        Dict with the number of requirement entries (``"dependencies"``) and
        the number of packages discovered under ``src`` (``"packages"``).

    Raises:
        FileNotFoundError: If ``requirements.txt`` does not exist.
    """
    # Read project dependencies
    with open('requirements.txt', encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        required = [
            entry for entry in stripped_lines
            if entry and not entry.startswith(("#", "-"))
        ]

    # Discover the packages once and reuse the result for the summary.
    packages = find_packages(where="src")

    # Define package metadata
    setup(
        name="ml_project",
        version="0.1.0",
        packages=packages,
        package_dir={"": "src"},
        install_requires=required,
        python_requires=">=3.8",
    )

    # Summarize how many dependencies and packages went into the build.
    return {
        "dependencies": len(required),
        "packages": len(packages),
    }
# Invoke the build helper and print its dependency/package counts.
print(create_minimal_build())
Slide 6: Data Science Project Structure
Pure Python allows for a cleaner, more organized data science project structure. This implementation demonstrates how to set up a machine learning project with proper separation of concerns and minimal dependencies.
from pathlib import Path
import json
def create_ds_project(root="."):
    """Create the directory skeleton of a data science project.

    The layout consists of ``data``, ``models``, ``notebooks`` and ``src``
    with their sub-directories. ``src`` and each of its sub-directories
    also receive an ``__init__.py``. Existing directories and files are
    left in place, so the call can be repeated.

    Args:
        root: Directory under which the skeleton is created. Defaults to the
            current working directory; missing parents are created.

    Returns:
        Dict mapping each top-level directory name to the list of its
        sub-directory names.
    """
    # Create project structure
    structure = {
        "data": ["raw", "processed", "interim"],
        "models": ["trained", "evaluations"],
        "notebooks": [],
        "src": ["data", "features", "models", "visualization"]
    }
    root_dir = Path(root)
    for directory, subdirs in structure.items():
        base_dir = root_dir / directory
        base_dir.mkdir(parents=True, exist_ok=True)

        # Only the source tree is turned into importable packages.
        is_package = directory == "src"
        if is_package:
            (base_dir / "__init__.py").touch()

        for subdir in subdirs:
            sub_path = base_dir / subdir
            sub_path.mkdir(exist_ok=True)
            if is_package:
                (sub_path / "__init__.py").touch()
    return structure
# Create the directory layout in the current working directory and
# pretty-print the structure mapping that was used.
print(json.dumps(create_ds_project(), indent=2))
Slide 7: Custom Environment Configuration
Managing environment configurations in Pure Python provides greater flexibility and control compared to Anaconda's approach. This implementation shows how to handle different environments efficiently.
import yaml
import os
from typing import Dict, Any
class EnvironmentConfig:
    """Settings for one named environment, stored as ``config/<env_name>.yaml``.

    On construction the YAML file is loaded into ``self.config``. If the
    file does not exist, a file with default settings is written first and
    those defaults are used.
    """

    def __init__(self, env_name: str):
        """Load (or create) the configuration for ``env_name``."""
        self.env_name = env_name
        self.config = self._load_config()

    def _load_config(self) -> Dict[str, Any]:
        """Return the settings from the environment's YAML file.

        Returns:
            The parsed mapping, the defaults if the file had to be created,
            or an empty dict if the file exists but contains no document.

        Raises:
            ValueError: If the file's top-level YAML value is not a mapping.
        """
        config_path = f"config/{self.env_name}.yaml"
        if not os.path.exists(config_path):
            return self._create_default_config()

        with open(config_path, 'r', encoding="utf-8") as f:
            loaded = yaml.safe_load(f)

        # safe_load returns None for an empty document; keep the declared
        # dict contract instead of handing None to callers.
        if loaded is None:
            return {}
        if not isinstance(loaded, dict):
            raise ValueError(
                f"{config_path} must contain a mapping at the top level, "
                f"got {type(loaded).__name__}"
            )
        return loaded

    def _create_default_config(self) -> Dict[str, Any]:
        """Write the default settings to the environment's file and return them."""
        config = {
            "data_path": "data/",
            "model_path": "models/",
            "log_level": "INFO",
            "max_workers": 4
        }
        os.makedirs("config", exist_ok=True)
        with open(f"config/{self.env_name}.yaml", 'w', encoding="utf-8") as f:
            yaml.dump(config, f)
        return config
# Usage example: load config/development.yaml, or create it with default
# values when the file does not exist yet, then show the resulting settings.
dev_config = EnvironmentConfig("development")
print(dev_config.config)
Slide 8: Efficient Package Management
Pure Python's pip allows for precise control over package versions and dependencies. This script demonstrates efficient package management and version control.
import subprocess
import json
from typing import Dict, List
class PackageManager:
    """Helpers for inspecting installed packages through pip."""

    @staticmethod
    def get_installed_packages() -> Dict[str, str]:
        """Return a ``name -> version`` mapping of what pip reports as installed.

        Raises:
            subprocess.CalledProcessError: If ``pip list`` exits with a
                non-zero status.
        """
        import sys

        # Run the pip that belongs to this interpreter, and fail loudly
        # instead of trying to JSON-decode the output of a failed command.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "list", "--format=json"],
            capture_output=True,
            text=True,
            check=True,
        )
        return {
            pkg["name"]: pkg["version"]
            for pkg in json.loads(result.stdout)
        }

    @staticmethod
    def _normalize(name: str) -> str:
        """Return ``name`` lower-cased with runs of ``-``, ``_``, ``.`` as ``-``."""
        import re

        return re.sub(r"[-_.]+", "-", name).lower()

    @staticmethod
    def check_dependencies(requirements_file: str,
                           installed: Dict[str, str] = None) -> List[str]:
        """List the requirements whose package is not installed.

        Blank lines, comments and pip option lines (starting with ``-``) are
        ignored. Only the package name is checked, so extras, version
        specifiers and markers are dropped before the lookup. Names are
        compared in normalized form, so ``Scikit_Learn`` matches
        ``scikit-learn``.

        Args:
            requirements_file: Path to a requirements file.
            installed: Optional ``name -> version`` mapping to check
                against. When ``None``, ``get_installed_packages()`` is
                called.

        Returns:
            Package names, as written in the file, that are not installed,
            in file order.
        """
        import re

        if installed is None:
            installed = PackageManager.get_installed_packages()
        known = {PackageManager._normalize(name) for name in installed}

        missing = []
        with open(requirements_file, 'r', encoding="utf-8") as f:
            for raw_line in f:
                requirement = raw_line.split("#", 1)[0].strip()
                if not requirement or requirement.startswith("-"):
                    continue
                # The name is the leading run of name characters; extras
                # ("[...]") and specifiers ("==", ">=", ...) end it.
                match = re.match(r"[A-Za-z0-9][A-Za-z0-9._-]*", requirement)
                if match is None:
                    continue
                package = match.group(0)
                if PackageManager._normalize(package) not in known:
                    missing.append(package)
        return missing
# Usage example: list what pip reports as installed, then report the
# requirements.txt entries whose package is absent from that listing.
pkg_manager = PackageManager()
print("Installed packages:", pkg_manager.get_installed_packages())
print("Missing dependencies:",
pkg_manager.check_dependencies("requirements.txt"))
Slide 9: Advanced Model Development Setup
This example demonstrates how Pure Python enables clean machine learning model development with minimal dependencies while maintaining full control over the development environment.
from pathlib import Path
from typing import Optional, Dict, Any
import pickle
import json
import time
class MLProject:
    """A project folder with ``models``, ``data``, ``logs`` and ``configs``.

    The folder is created at ``<base_dir>/<project_name>`` when the object
    is constructed.
    """

    def __init__(self, project_name: str, base_dir: str = "projects"):
        """Create the project directories.

        Args:
            project_name: Name of the project and of its folder.
            base_dir: Directory that contains all project folders.
        """
        self.project_name = project_name
        self.project_path = Path(base_dir) / project_name
        self._setup_project()

    def _setup_project(self):
        """Create the standard sub-directories; existing ones are kept."""
        for dir_name in ("models", "data", "logs", "configs"):
            (self.project_path / dir_name).mkdir(parents=True, exist_ok=True)

    def save_model(self, model: Any,
                   model_name: str,
                   metadata: Optional[Dict] = None):
        """Pickle ``model`` and write a JSON metadata file next to it.

        Args:
            model: Any picklable object.
            model_name: Base name of the two output files.
            metadata: Optional extra fields for the metadata file. The dict
                is not modified. ``saved_at`` and ``model_name`` are always
                set by this method and override same-named keys.

        Returns:
            Tuple ``(model_path, meta_path)`` of the two written files.
        """
        models_dir = self.project_path / "models"
        model_path = models_dir / f"{model_name}.pkl"
        meta_path = models_dir / f"{model_name}_meta.json"

        # Save model
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)

        # Work on a copy so the caller's dict does not silently gain the
        # "saved_at" and "model_name" keys.
        record = dict(metadata) if metadata else {}
        record.update({
            "saved_at": time.time(),
            "model_name": model_name
        })
        with open(meta_path, 'w', encoding="utf-8") as f:
            json.dump(record, f, indent=2)
        return model_path, meta_path
# Usage example: pickle a placeholder "model" dict into the
# classification_project models folder together with a JSON metadata file,
# then print both paths.
project = MLProject("classification_project")
dummy_model = {"type": "random_forest"}
model_path, meta_path = project.save_model(
dummy_model,
"rf_classifier",
{"accuracy": 0.95}
)
print(f"Model saved at: {model_path}")
print(f"Metadata saved at: {meta_path}")
Slide 10: Production Deployment Setup
Pure Python's lightweight nature makes it ideal for production deployments. This example shows how to prepare a model for production while maintaining minimal dependencies.
from typing import Dict, Any
import json
import hashlib
import datetime
class ProductionDeployment:
    """Collects deployment metadata for a model and writes it to a JSON file."""

    def __init__(self, model_name: str,
                 requirements_file: str = "requirements.txt",
                 deployments_dir: str = "deployments"):
        """Gather the initial deployment metadata.

        Args:
            model_name: Name of the model being deployed.
            requirements_file: File whose ``name==version`` pins are
                recorded as the deployment's dependencies.
            deployments_dir: Directory that receives the deployment JSON.

        Raises:
            FileNotFoundError: If ``requirements_file`` does not exist.
        """
        self.model_name = model_name
        self.requirements_file = requirements_file
        self.deployments_dir = deployments_dir
        self.deployment_info = self._init_deployment_info()

    def _init_deployment_info(self) -> Dict[str, Any]:
        """Build the metadata dict with status ``"initialized"``."""
        return {
            "model_name": self.model_name,
            "deployment_id": self._generate_deployment_id(),
            "deployment_date": datetime.datetime.now().isoformat(),
            "dependencies": self._get_dependencies(),
            "status": "initialized"
        }

    def _generate_deployment_id(self) -> str:
        """Return a 12-character hex id derived from model name and current time."""
        timestamp = datetime.datetime.now().isoformat()
        # MD5 is used only to get a short identifier, not for security.
        return hashlib.md5(
            f"{self.model_name}_{timestamp}".encode()
        ).hexdigest()[:12]

    def _get_dependencies(self) -> Dict[str, str]:
        """Return the ``name -> version`` pins found in the requirements file.

        Only ``name==version`` entries are recorded. Comments and
        environment markers are removed first, so a stray ``==`` inside them
        cannot break the parsing.
        """
        deps = {}
        with open(self.requirements_file, 'r', encoding="utf-8") as f:
            for line in f:
                requirement = line.split("#", 1)[0].split(";", 1)[0].strip()
                name, separator, version = requirement.partition("==")
                if separator:
                    deps[name.strip()] = version.strip()
        return deps

    def prepare_deployment(self) -> Dict[str, Any]:
        """Mark the deployment as ready, persist it, and return the metadata."""
        self.deployment_info["status"] = "ready"
        self._save_deployment_config()
        return self.deployment_info

    def _save_deployment_config(self):
        """Write the metadata to ``<deployments_dir>/<deployment_id>.json``."""
        import os

        # The target directory is not guaranteed to exist on first use.
        os.makedirs(self.deployments_dir, exist_ok=True)
        config_path = os.path.join(
            self.deployments_dir,
            f"{self.deployment_info['deployment_id']}.json",
        )
        with open(config_path, 'w', encoding="utf-8") as f:
            json.dump(self.deployment_info, f, indent=2)
# Usage example: collect deployment metadata for "sentiment_analyzer", mark
# it ready (which also writes deployments/<deployment_id>.json), and print it.
deployment = ProductionDeployment("sentiment_analyzer")
deployment_info = deployment.prepare_deployment()
print(json.dumps(deployment_info, indent=2))
Slide 11: Performance Monitoring Setup
A crucial advantage of Pure Python is the ability to implement lightweight yet powerful monitoring systems. This implementation shows how to track model performance and resource usage efficiently.
import time
import psutil
import json
from datetime import datetime
from typing import Dict, List
class PerformanceMonitor:
    """Collects CPU and memory samples for a model and saves them as JSON."""

    def __init__(self, model_name: str):
        """Start with an empty list of samples for ``model_name``."""
        self.model_name = model_name
        self.metrics: List[Dict] = []

    def capture_metrics(self, prediction_count: int,
                        interval: float = 1) -> Dict:
        """Take one resource sample, store it, and return it.

        Args:
            prediction_count: Number of predictions to record with the sample.
            interval: Value passed to ``psutil.cpu_percent(interval=...)``.
                With the default of 1 the call blocks for about one second
                while CPU usage is measured.

        Returns:
            The sample dict with timestamp, model name, CPU percentage,
            resident memory of this process in MiB, and prediction count.
            The same dict object is appended to ``self.metrics``.
        """
        cpu_percent = psutil.cpu_percent(interval=interval)
        memory_info = psutil.Process().memory_info()
        metrics = {
            "timestamp": datetime.now().isoformat(),
            "model_name": self.model_name,
            "cpu_percent": cpu_percent,
            "memory_mb": memory_info.rss / (1024 * 1024),  # bytes -> MiB
            "prediction_count": prediction_count,
        }
        self.metrics.append(metrics)
        return metrics

    def save_metrics(self, filepath: str):
        """Write the model name and all captured samples to ``filepath`` as JSON."""
        payload = {
            "model_name": self.model_name,
            "metrics": self.metrics
        }
        with open(filepath, 'w', encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
# Usage example: take three samples with a growing prediction count, pausing
# one second after each, then write everything to performance_log.json.
monitor = PerformanceMonitor("text_classifier")
for i in range(3):
    prediction_total = 100 * (i + 1)
    metrics = monitor.capture_metrics(prediction_total)
    print(f"Captured metrics: {metrics}")
    time.sleep(1)
monitor.save_metrics("performance_log.json")
Slide 12: Automated Testing Framework
Pure Python enables the creation of comprehensive testing frameworks without unnecessary dependencies. This implementation demonstrates how to set up automated testing for machine learning models.
import unittest
from typing import Any, Dict, List
import numpy as np
from pathlib import Path
class MLModelTest(unittest.TestCase):
    """Example unit tests for a model, run against seeded random data."""

    def setUp(self):
        """Make sure the ``tests/test_data`` directory exists before each test."""
        data_dir = Path("tests/test_data")
        data_dir.mkdir(parents=True, exist_ok=True)
        self.test_data_path = data_dir

    def generate_test_data(self,
                           n_samples: int = 1000) -> Dict[str, np.ndarray]:
        """Return seeded random features ``X`` (n_samples x 10) and 0/1 labels ``y``."""
        # Re-seeding on every call makes each call return the same data.
        np.random.seed(42)
        features = np.random.randn(n_samples, 10)
        labels = np.random.randint(0, 2, n_samples)
        return {"X": features, "y": labels}

    def test_model_predictions(self):
        """Predictions must match the input length and lie within [0, 1]."""
        class DummyModel:
            def predict(self, X):
                return np.ones(len(X))

        samples = self.generate_test_data()["X"]
        predictions = DummyModel().predict(samples)

        self.assertEqual(len(predictions), len(samples))
        self.assertTrue(np.all(predictions >= 0))
        self.assertTrue(np.all(predictions <= 1))

    def test_model_performance(self):
        """An all-ones predictor must score strictly between 0 and 1 accuracy."""
        labels = self.generate_test_data()["y"]
        dummy_predictions = np.ones(len(labels))

        # Accuracy is the fraction of labels that equal the prediction.
        metrics = {"accuracy": np.mean(labels == dummy_predictions)}

        self.assertGreater(metrics["accuracy"], 0)
        self.assertLess(metrics["accuracy"], 1)


if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)
Slide 13: Experimental Results Tracking
Pure Python allows for efficient tracking of machine learning experiments without the overhead of additional frameworks. This implementation provides a clean way to log and compare experimental results.
from datetime import datetime
import json
from typing import Dict, List, Optional
import hashlib
class ExperimentTracker:
    """Keeps experiment records in memory and writes each one to a JSON file."""

    def __init__(self, project_name: str, output_dir: str = "experiments"):
        """Start an empty tracker.

        Args:
            project_name: Name stored with every experiment and used when
                generating experiment ids.
            output_dir: Directory that receives one JSON file per experiment.
        """
        self.project_name = project_name
        self.output_dir = output_dir
        self.experiments: List[Dict] = []

    def log_experiment(self,
                       model_params: Dict,
                       metrics: Dict,
                       dataset_info: Optional[Dict] = None) -> str:
        """Record one experiment in memory and on disk.

        Args:
            model_params: Parameters of the model that was run.
            metrics: Metric name -> value for this run.
            dataset_info: Optional dataset description; stored as ``{}``
                when omitted.

        Returns:
            The generated experiment id.
        """
        experiment_id = self._generate_experiment_id()
        experiment = {
            "experiment_id": experiment_id,
            "timestamp": datetime.now().isoformat(),
            "model_parameters": model_params,
            "metrics": metrics,
            "dataset_info": dataset_info or {},
            "project_name": self.project_name
        }
        self.experiments.append(experiment)
        self._save_experiment(experiment)
        return experiment_id

    def _generate_experiment_id(self) -> str:
        """Return an 8-character hex id derived from project name and current time."""
        timestamp = datetime.now().isoformat()
        unique_string = f"{self.project_name}_{timestamp}"
        # MD5 is used only to get a short identifier, not for security.
        return hashlib.md5(unique_string.encode()).hexdigest()[:8]

    def _save_experiment(self, experiment: Dict):
        """Write ``experiment`` to ``<output_dir>/<experiment_id>.json``."""
        import os

        # The target directory is not guaranteed to exist on first use.
        os.makedirs(self.output_dir, exist_ok=True)
        filename = os.path.join(
            self.output_dir, f"{experiment['experiment_id']}.json"
        )
        with open(filename, 'w', encoding="utf-8") as f:
            json.dump(experiment, f, indent=2)

    def get_best_experiment(self,
                            metric_name: str,
                            higher_is_better: bool = True) -> Dict:
        """Return the logged experiment with the best value of ``metric_name``.

        Args:
            metric_name: Key to look up in each experiment's ``metrics``.
            higher_is_better: Pick the largest value when true, the smallest
                otherwise. Ties go to the experiment that was logged first.

        Raises:
            IndexError: If no experiment has been logged yet.
            KeyError: If an experiment has no value for ``metric_name``.
        """
        if not self.experiments:
            raise IndexError("no experiments have been logged yet")
        # max/min return the first best element, which is the same result a
        # stable sort followed by [0] would give, without sorting the list.
        select = max if higher_is_better else min
        return select(self.experiments, key=lambda x: x["metrics"][metric_name])
# Usage example: log a single experiment (also written to
# experiments/<experiment_id>.json), then look up the best one by accuracy.
tracker = ExperimentTracker("text_classification")
experiment_id = tracker.log_experiment(
model_params={"learning_rate": 0.01, "max_depth": 5},
metrics={"accuracy": 0.92, "f1_score": 0.90},
dataset_info={"size": 10000, "features": 100}
)
print(f"Logged experiment: {experiment_id}")
best_exp = tracker.get_best_experiment("accuracy")
print(f"Best experiment: {best_exp}")
Slide 14: Additional Resources
- "Reproducible Machine Learning with Pure Python"
- Search on Google Scholar for: "Python Environment Management in Production ML Systems"
- "Efficient Model Deployment Strategies"
- "Best Practices for ML Production Systems"
- "Scalable Machine Learning Pipeline Design"
- Search for: "MLOps Best Practices with Python" on Google Scholar
- "Minimalistic Approaches to Large Scale ML Systems"
- Search for: "Lightweight ML Systems Design" on Google Scholar