Quick Start¶
Get up and running with OpenML Crawler in minutes. This guide covers the most common use cases and basic functionality.
Basic Usage¶
Loading Built-in Datasets¶
OpenML Crawler provides easy access to various data sources through built-in connectors:
from openmlcrawler import load_dataset
# Load weather data
weather_df = load_dataset("weather", location="New York", days=7)
print(f"Weather data shape: {weather_df.shape}")
print(weather_df.head())
# Load Twitter data
twitter_df = load_dataset("twitter", query="artificial intelligence", max_results=100)
print(f"Twitter data shape: {twitter_df.shape}")
# Load government data
gov_df = load_dataset("us_gov", query="education", limit=50)
print(f"Government data shape: {gov_df.shape}")
Crawling Web Data¶
Crawl data from any web source:
from openmlcrawler import crawl_and_prepare
# Crawl a CSV file
df = crawl_and_prepare(
    source="https://example.com/data.csv",
    type="csv",
    label_column="target"
)
print(f"Crawled {len(df)} records")
Searching for Datasets¶
Find datasets across multiple platforms:
from openmlcrawler import search_open_data
# Search for datasets
results = search_open_data("climate change", max_results=10)
for result in results:
    print(f"{result['title']}")
    print(f"  Source: {result['source']}")
    print(f"  URL: {result['url']}")
    print()
Data Processing¶
Basic Data Cleaning¶
from openmlcrawler import prepare_for_ml
# Load and clean data
df = load_dataset("weather", location="London", days=30)
# Prepare for ML
X, y, X_train, X_test, y_train, y_test = prepare_for_ml(
    df,
    target_column="temperature",
    test_size=0.2,
    normalize=True
)
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
Data Quality Assessment¶
from openmlcrawler import assess_data_quality
# Assess data quality
quality_report = assess_data_quality(df)
print("๐ Data Quality Report")
print(f"Completeness: {quality_report['completeness_score']:.2%}")
print(f"Missing rate: {quality_report['missing_rate']:.2%}")
print(f"Duplicate rate: {quality_report['duplicate_rate']:.2%}")
Exporting Data¶
Export to Different Formats¶
from openmlcrawler import export_dataset
# Export to CSV
export_dataset(df, "data.csv", format="csv")
# Export to JSON
export_dataset(df, "data.json", format="json")
# Export to Parquet
export_dataset(df, "data.parquet", format="parquet")
Command Line Interface¶
OpenML Crawler also provides a powerful CLI for automation:
# Load weather data
openmlcrawler load weather --location "Tokyo" --days 14 --output weather.csv
# Crawl a dataset
openmlcrawler crawl https://data.example.com/dataset.csv --type csv --output dataset.csv
# Search for datasets
openmlcrawler search "machine learning" --max-results 20
# Assess data quality
openmlcrawler quality dataset.csv --format json --output quality_report.json
# Export data
openmlcrawler export dataset.csv --format parquet --output dataset.parquet
Advanced Examples¶
Social Media Analysis¶
# Analyze Twitter engagement
twitter_df = load_dataset("twitter", query="python programming", max_results=500)
# Basic analysis
print(f"Average likes: {twitter_df['like_count'].mean():.1f}")
print(f"Most active language: {twitter_df['lang'].mode().iloc[0]}")
# Filter by engagement
high_engagement = twitter_df[twitter_df['like_count'] > twitter_df['like_count'].quantile(0.9)]
print(f"High engagement tweets: {len(high_engagement)}")
Government Data Integration¶
# Combine multiple government data sources
us_data = load_dataset("us_gov", query="healthcare", limit=100)
eu_data = load_dataset("eu_gov", query="healthcare", limit=100)
# Basic comparison
print(f"US datasets: {len(us_data)}")
print(f"EU datasets: {len(eu_data)}")
# Find common themes
us_themes = set(us_data['tags'].explode().dropna())
eu_themes = set(eu_data['tags'].explode().dropna())
common_themes = us_themes.intersection(eu_themes)
print(f"Common themes: {len(common_themes)}")
Time Series Analysis¶
import pandas as pd

# Load time series data
weather_df = load_dataset("weather", location="San Francisco", days=365)

# Basic time series analysis
weather_df['date'] = pd.to_datetime(weather_df['date'])
weather_df = weather_df.set_index('date')
# Monthly averages
monthly_avg = weather_df.resample('M')['temperature'].mean()
print("Monthly temperature averages:")
print(monthly_avg)
# Trend analysis
from sklearn.linear_model import LinearRegression
import numpy as np
X = np.arange(len(monthly_avg)).reshape(-1, 1)
y = monthly_avg.values
model = LinearRegression()
model.fit(X, y)
trend = model.coef_[0] * 12  # the slope is per month, so multiply by 12 for the annual trend
print(f"Temperature trend: {trend:.2f}ยฐC per year")
Workflow Automation¶
YAML-Based Pipelines¶
Create a pipeline.yaml file:
datasets:
  - name: weather_data
    connector: weather
    params:
      location: "Seattle"
      days: 30
    output: "weather_seattle.csv"
  - name: twitter_data
    connector: twitter
    params:
      query: "data science"
      max_results: 200
    output: "twitter_ds.csv"
  - name: gov_data
    connector: us_gov
    params:
      query: "technology"
      limit: 50
    output: "gov_tech.csv"
Run the pipeline:
from openmlcrawler import execute_workflow_from_file
# Execute the workflow
result = execute_workflow_from_file("pipeline.yaml")
print(f"Workflow status: {result['status']}")
Custom Processing Pipeline¶
from openmlcrawler.core.self_healing import SelfHealingPipeline
from openmlcrawler.core.quality import assess_data_quality
# Create a self-healing pipeline
pipeline = SelfHealingPipeline(
    max_retries=3,
    retry_delay=1.0,
    enable_anomaly_detection=True
)

@pipeline.with_self_healing
def process_dataset(source_url):
    """Process dataset with automatic error recovery."""
    # Load data
    df = crawl_and_prepare(source_url, type="csv")

    # Assess quality
    quality = assess_data_quality(df)
    if quality['completeness_score'] < 0.8:
        raise ValueError("Data quality too low")

    # Clean and prepare
    X, y, X_train, X_test, y_train, y_test = prepare_for_ml(
        df, target_column="target", test_size=0.2
    )

    return {
        'X_train': X_train,
        'y_train': y_train,
        'quality_score': quality['completeness_score']
    }
# Execute with automatic retry and fallback
result = process_dataset("https://example.com/data.csv")
print(f"Processing completed with quality score: {result['quality_score']:.2%}")
Monitoring and Logging¶
Real-time Monitoring¶
from openmlcrawler.core.monitoring import create_real_time_monitor
# Create monitor
monitor = create_real_time_monitor()
# Configure features to monitor
monitor.set_feature_columns(['temperature', 'humidity', 'pressure'])
# Start monitoring
monitor.start_monitoring()
# Process a data stream (data_stream can be any iterable of feature dicts)
for data_point in data_stream:
    result = monitor.process_data_point(data_point)
    if result['anomaly_detected']:
        print(f"Anomaly detected: {result['anomaly_score']:.3f}")
# Get monitoring statistics
stats = monitor.get_monitoring_status()
print(f"Processed {stats['total_points']} data points")
print(f"Anomalies detected: {stats['anomaly_count']}")
Best Practices¶
Data Quality¶
- Always assess data quality before using datasets
- Handle missing values appropriately for your use case
- Check for duplicates and remove them
- Validate data types and ranges
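A minimal sketch of these checks with pandas, reusing assess_data_quality from above; the specific column names (temperature, humidity) are assumptions for illustration:
from openmlcrawler import load_dataset, assess_data_quality

# Quantify problems before changing anything
df = load_dataset("weather", location="Berlin", days=30)
report = assess_data_quality(df)
print(f"Completeness before cleaning: {report['completeness_score']:.2%}")

# Remove exact duplicates and rows missing the key value
df = df.drop_duplicates()
df = df.dropna(subset=["temperature"])  # assumed column name

# Fill remaining numeric gaps with the column median (one common, simple choice)
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Validate types and ranges; here humidity is assumed to be a percentage
assert df["humidity"].between(0, 100).all(), "humidity outside expected range"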
Performance¶
- Use appropriate data types to save memory
- Filter data early in your pipeline
- Cache frequently used datasets
- Use streaming for large datasets
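One way to apply these points with plain pandas; the file paths and column names below are illustrative, not part of the OpenML Crawler API:
import pandas as pd

# Read only the columns you need, with compact dtypes, to save memory
df = pd.read_csv(
    "dataset.csv",
    usecols=["date", "temperature", "station"],
    dtype={"temperature": "float32", "station": "category"},
    parse_dates=["date"],
)

# Filter early so every later step works on less data
df = df[df["date"] >= "2024-01-01"]

# Cache the trimmed result; reading Parquet back is much faster than re-crawling
df.to_parquet("dataset_cache.parquet")

# For files that do not fit in memory, process them in chunks (simple streaming)
for chunk in pd.read_csv("dataset.csv", usecols=["temperature"], chunksize=100_000):
    print(chunk["temperature"].mean())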
Error Handling¶
- Use self-healing pipelines for robust data processing
- Implement proper logging for debugging
- Handle API rate limits gracefully
- Validate data at each processing step
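For the rate-limit and logging points, a generic sketch using requests and the standard logging module; fetch_with_backoff is a hypothetical helper, not part of the OpenML Crawler API:
import logging
import time

import requests

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("crawler")

def fetch_with_backoff(url, max_retries=5, base_delay=1.0):
    """Hypothetical helper: retry transient failures and respect HTTP 429 rate limits."""
    for attempt in range(max_retries):
        response = requests.get(url, timeout=30)
        if response.status_code == 429:
            # Honour Retry-After if the API sends it, otherwise back off exponentially
            delay = float(response.headers.get("Retry-After", base_delay * 2 ** attempt))
            log.warning("Rate limited, sleeping %.1fs (attempt %d)", delay, attempt + 1)
            time.sleep(delay)
            continue
        response.raise_for_status()  # surface other HTTP errors to the caller
        return response
    raise RuntimeError(f"Giving up on {url} after {max_retries} attempts")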
Next Steps¶
- Explore Connectors: Learn about all available data sources
- Advanced Features: Discover ML integration and cloud storage
- Tutorials: Follow step-by-step guides for specific use cases
- API Reference: Understand all available functions and classes
Getting Help¶
- Full Documentation
- Report Issues
- Community Discussions
- Support Email