Advanced Media Converter
A comprehensive audio processing application built with Python that combines detailed analysis, visualization, format conversion, and batch processing capabilities with a clean, user-friendly interface.
The Real Problem
Audio processing workflows typically involve multiple fragmented steps and tools:
- Format Fragmentation: Different tools are needed for conversion, analysis, and editing
- Command-Line Complexity: Many powerful tools require complex command-line usage
- Analysis Limitations: Basic converters lack meaningful audio analysis capabilities
- Batch Processing Challenges: Processing multiple files with consistent settings is tedious
- Visualization Gaps: Seeing waveforms while editing requires separate specialized tools
The Technical Solution
The Advanced Media Converter addresses these challenges through a modular architecture with specialized components:
class MediaConverter:
    """Application object holding conversion state and the audio helpers."""

    def __init__(self):
        # Nothing is selected or loaded when the converter is created.
        self.input_path: Optional[Path] = None
        self.output_path: Optional[Path] = None
        self.current_audio_file: Optional[Path] = None
        self.stop_event = Event()

        # Helper components; the batch processor is handed this converter.
        self.analyzer = AudioAnalyzer()
        self.trimmer = AudioTrimmer()
        self.batch_processor = BatchProcessor(self)

        # Per-format settings: file extension, codec name, and the listed
        # bitrates and sample rates (all stored as strings).
        self.format_options = {
            'WAV': dict(
                extension='wav',
                codec='pcm_s16le',
                bitrates=['1411k'],
                sample_rates=['44100', '48000', '96000'],
            ),
            'MP3': dict(
                extension='mp3',
                codec='libmp3lame',
                bitrates=['320k', '256k', '192k', '128k'],
                sample_rates=['44100', '48000'],
            ),
            'AAC': dict(
                extension='m4a',
                codec='aac',
                bitrates=['256k', '192k', '128k'],
                sample_rates=['44100', '48000'],
            ),
            'FLAC': dict(
                extension='flac',
                codec='flac',
                bitrates=['0'],  # Lossless
                sample_rates=['44100', '48000', '96000'],
            ),
        }
1. Advanced Audio Analysis
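The analyzer loads a file (soundfile first, librosa as a fallback) and extracts duration, tempo and beat positions, volume and spectral features, harmonic and percussive components, and a mel spectrogram summary: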
def analyze_audio(self, file_path: Path) -> bool:
    """
    Perform detailed audio analysis and store the result in ``self.audio_info``.

    The file is read with soundfile when possible and with librosa otherwise,
    mixed down to mono, and summarised as duration, tempo/beats, volume and
    spectral statistics, harmonic/percussive levels and a mel spectrogram mean.

    Args:
        file_path: Path to audio file
    Returns:
        bool: True if analysis successful
    Raises:
        MediaConverterError: If analysis fails
    """
    try:
        # Bind librosa before anything else. A function-level import makes the
        # name local to the whole function, so importing it only inside the
        # fallback branch leaves it unbound whenever soundfile succeeds.
        import librosa

        self.logger.info(f"Analyzing audio file: {file_path}")

        # Use soundfile first, fall back to librosa
        try:
            import soundfile as sf
            data, sr = sf.read(str(file_path))
            # soundfile returns (frames, channels); librosa works on
            # (channels, frames), so transpose multi-channel data.
            if data.ndim > 1:
                data = data.T
        except Exception:
            # sr=None preserves the original sample rate; mono=False keeps the
            # channels so that they can be counted before the mix-down.
            data, sr = librosa.load(str(file_path), sr=None, mono=False)

        # Record the channel count before the signal is reduced to mono.
        channels = 1 if data.ndim == 1 else data.shape[0]
        if data.ndim > 1:
            data = librosa.to_mono(data)

        # Basic audio properties
        duration = librosa.get_duration(y=data, sr=sr)
        tempo, beats = librosa.beat.beat_track(y=data, sr=sr)
        # Depending on the librosa version tempo is a scalar or a 1-element
        # array; normalise it so float() is always given a scalar.
        tempo = np.atleast_1d(tempo)[0]

        # Audio features
        rmse = librosa.feature.rms(y=data)[0]
        spectral_centroid = librosa.feature.spectral_centroid(y=data, sr=sr)[0]
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=data, sr=sr)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(y=data, sr=sr)[0]
        zero_crossing_rate = librosa.feature.zero_crossing_rate(data)[0]

        # Silence detection with adjustable parameters.
        # NOTE: librosa.effects.split returns the NON-silent intervals as
        # [start, end] sample indices; they are stored under the existing
        # 'silent_intervals' key so consumers of that key keep working.
        intervals = librosa.effects.split(
            data,
            top_db=20,  # Adjust this value for silence detection sensitivity
            frame_length=2048,
            hop_length=512
        )

        # Harmonic and percussive components
        y_harmonic, y_percussive = librosa.effects.hpss(data)

        # Mel spectrogram with adjusted parameters
        mel_spec = librosa.feature.melspectrogram(
            y=data,
            sr=sr,
            n_mels=128,
            fmax=sr/2
        )

        self.audio_info = {
            'duration': float(duration),
            'sample_rate': int(sr),
            'tempo': float(tempo),
            'beat_frames': beats.tolist(),
            'audio_features': {
                'avg_volume': float(np.mean(rmse)),
                'max_volume': float(np.max(rmse)),
                'spectral_centroid_mean': float(np.mean(spectral_centroid)),
                'spectral_bandwidth_mean': float(np.mean(spectral_bandwidth)),
                'spectral_rolloff_mean': float(np.mean(spectral_rolloff)),
                'zero_crossing_rate_mean': float(np.mean(zero_crossing_rate)),
                # RMS is an amplitude, so convert with amplitude_to_db
                # (20*log10) rather than power_to_db (10*log10).
                'perceived_loudness': float(np.mean(librosa.amplitude_to_db(rmse)))
            },
            'silent_intervals': intervals.tolist(),
            'components': {
                'harmonic_mean': float(np.mean(np.abs(y_harmonic))),
                'percussive_mean': float(np.mean(np.abs(y_percussive)))
            },
            'mel_spectrogram_mean': float(np.mean(mel_spec)),
            'file_info': {
                'channels': channels,
                'file_path': str(file_path),
                'format': file_path.suffix[1:].upper()
            }
        }
        return True
    except Exception as e:
        self.logger.error(f"Audio analysis failed: {str(e)}", exc_info=True)
        raise MediaConverterError(f"Audio analysis failed: {str(e)}") from e
2. Interactive Waveform Visualization
The waveform view supports mouse-wheel zoom, drag-to-pan, and selecting a time range with a horizontal span selector:
class EnhancedWaveformVisualizer(WaveformVisualizer):
    """Waveform view extended with zoom, pan, and range selection."""

    def __init__(self, canvas_frame: ttk.Frame):
        super().__init__(canvas_frame)
        self.setup_interactive_elements()
        self.logger = logging.getLogger(f"{__name__}.EnhancedWaveformVisualizer")

    def setup_interactive_elements(self) -> None:
        """Reset the view state and attach the mouse and selection handlers."""
        # View state: unzoomed, covering the 0..1.0 range, no duration yet.
        self.zoom_level = 1.0
        self.view_start = 0
        self.view_end = 1.0
        self.duration = 0

        tk_widget = self.canvas.get_tk_widget()
        # Mouse wheel zooms, dragging with button 1 pans.
        tk_widget.bind('<MouseWheel>', self.handle_zoom)
        tk_widget.bind('<B1-Motion>', self.handle_pan)

        # Horizontal range selection drawn as a translucent blue span.
        self.span = SpanSelector(
            self.plot,
            self.on_select,
            'horizontal',
            useblit=True,
            props={'alpha': 0.5, 'facecolor': 'tab:blue'},
        )
3. Intelligent Audio Trimming
Trimming is delegated to FFmpeg, with a short fade-in and fade-out applied at the cut points:
def trim_audio(self, input_path: Path, output_path: Path,
               start_time: float, end_time: float,
               fade_duration: float = 0.1) -> bool:
    """
    Trim audio file with optional fade in/out

    Runs FFmpeg to keep the section between ``start_time`` and ``end_time``
    and overwrites ``output_path`` if it exists.

    Args:
        input_path: Source audio file path
        output_path: Output file path
        start_time: Start time in seconds (must be >= 0)
        end_time: End time in seconds (must be greater than start_time)
        fade_duration: Duration of fade in/out in seconds. Values <= 0
            disable the fades; values longer than half of the kept section
            are shortened to half of it.
    Returns:
        bool: True if trimming successful
    Raises:
        MediaConverterError: If the range is invalid or FFmpeg fails
    """
    try:
        self.logger.info(f"Trimming audio: {input_path} -> {output_path}")

        # Reject impossible ranges before spawning FFmpeg.
        if start_time < 0 or end_time <= start_time:
            raise ValueError(
                f"Invalid trim range: start={start_time}, end={end_time}"
            )

        # -ss/-to are placed after -i, as in the original command.
        command = [
            'ffmpeg',
            '-i', str(input_path),
            '-ss', str(start_time),
            '-to', str(end_time),
        ]

        # Keep fade-in and fade-out from overlapping on short clips.
        fade = min(fade_duration, (end_time - start_time) / 2)
        # afade treats d=0 as "use the default length", so a zero or negative
        # fade must omit the filter entirely instead of passing d=0.
        if fade > 0:
            command.extend([
                '-af',
                f'afade=t=in:st={start_time}:d={fade},'
                f'afade=t=out:st={end_time - fade}:d={fade}',
            ])

        command.extend(['-y', str(output_path)])

        # Run FFmpeg
        process = subprocess.run(
            command,
            capture_output=True,
            text=True
        )
        if process.returncode != 0:
            raise MediaConverterError(f"FFmpeg error: {process.stderr}")

        self.logger.info("Trimming completed successfully")
        return True
    except Exception as e:
        self.logger.error(f"Trimming failed: {str(e)}", exc_info=True)
        raise MediaConverterError(f"Trimming failed: {str(e)}") from e
4. Efficient Batch Processing
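The batch processor works through the configured files one at a time, reports status, progress, and per-file errors through a message queue, and can be cancelled between files: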
class BatchProcessor:
    """Handles batch processing of audio files"""

    def __init__(self, converter):
        # Object whose process_file(file_path, settings) does the actual work.
        self.converter = converter
        # Files to process, mapped to their per-file settings.
        self.batch_settings: Dict[Path, Dict] = {}
        # Outgoing (message_type, payload) tuples: status/progress/log/error.
        self.queue = Queue()
        # Set by another party to cancel the batch between files.
        self.stop_event = Event()
        self.logger = logging.getLogger(__name__ + '.BatchProcessor')

    def process_batch(self) -> None:
        """
        Process all files in batch

        A failing file is reported through the queue and skipped; the
        remaining files are still processed. Progress is advanced for every
        attempted file, so it reaches 100 even when some files fail.

        Raises:
            MediaConverterError: If batch processing fails
        """
        try:
            self.logger.info("Starting batch processing")
            self.stop_event.clear()

            # Work on a snapshot: the dict may be edited while the batch runs,
            # and mutating a dict during iteration raises RuntimeError.
            jobs = list(self.batch_settings.items())
            total = len(jobs)

            for i, (file_path, settings) in enumerate(jobs, 1):
                if self.stop_event.is_set():
                    self.logger.info("Batch processing cancelled")
                    self.queue.put(('status', "Batch processing cancelled"))
                    break

                try:
                    self.logger.info(f"Processing file {i}/{total}: {file_path}")
                    self.queue.put(('status', f"Processing: {file_path.name}"))
                    # Process file with current settings
                    self.converter.process_file(file_path, settings)
                except Exception as e:
                    self.logger.error(f"Error processing {file_path}: {str(e)}",
                                      exc_info=True)
                    self.queue.put(('error', f"Error processing {file_path}: {str(e)}"))
                    succeeded = False
                else:
                    succeeded = True

                # Update progress for failed files too.
                self.queue.put(('progress', (i / total) * 100))
                if succeeded:
                    self.queue.put(('log', f"Completed: {file_path.name}"))
        except Exception as e:
            self.logger.error(f"Batch processing failed: {str(e)}", exc_info=True)
            raise MediaConverterError(f"Batch processing failed: {str(e)}") from e
5. Comprehensive Error Handling
The application implements robust error handling throughout:
class MediaConverterError(Exception):
    """Application-specific error raised when a media operation fails."""
# Error handling pattern used throughout the application
try:
    ...  # Processing logic
except Exception as e:
    # Log with the traceback, then re-raise as the application's own error.
    self.logger.error(f"Processing failed: {str(e)}", exc_info=True)
    # "from e" records the original exception as the explicit cause.
    raise MediaConverterError(f"Processing failed: {str(e)}") from e
Advanced Features
1. Multi-Format Support
The application supports multiple audio formats with appropriate configuration:
# Format options: file extension, codec name, and the listed bitrates and
# sample rates for each output format (all values are strings).
self.format_options = {
    'WAV': dict(
        extension='wav',
        codec='pcm_s16le',
        bitrates=['1411k'],
        sample_rates=['44100', '48000', '96000'],
    ),
    'MP3': dict(
        extension='mp3',
        codec='libmp3lame',
        bitrates=['320k', '256k', '192k', '128k'],
        sample_rates=['44100', '48000'],
    ),
    'AAC': dict(
        extension='m4a',
        codec='aac',
        bitrates=['256k', '192k', '128k'],
        sample_rates=['44100', '48000'],
    ),
    'FLAC': dict(
        extension='flac',
        codec='flac',
        bitrates=['0'],  # Lossless
        sample_rates=['44100', '48000', '96000'],
    ),
}
2. Silence Detection
The application identifies silent sections in audio for cleaning and editing:
def detect_silence(self) -> None:
    """Detect and mark silent intervals.

    Lists every region returned by ``self.analyzer.get_silence_regions()`` in
    the analysis text widget and, when the trim mode is "advanced", shades
    the regions in red on the waveform plot.
    """
    try:
        # Bail out when no analysis result is stored on the analyzer.
        if not self.analyzer.audio_info:
            raise MediaConverterError("No analysis data available")
        silence_regions = self.analyzer.get_silence_regions()
        if not silence_regions:
            self.log_message("No silent regions detected")
            return
        # Update analysis display
        self.analysis_text.insert(tk.END, "\nSilent Regions:\n--------------\n")
        # Each region is read through its 'start' and 'end' keys
        # (presumably seconds -- confirm against get_silence_regions).
        for i, region in enumerate(silence_regions, 1):
            self.analysis_text.insert(
                tk.END,
                f"Region {i}: {self.format_time(region['start'])} - "
                f"{self.format_time(region['end'])}\n"
            )
        # Mark silent regions on waveform if in advanced mode
        if self.trim_mode.get() == "advanced":
            for region in silence_regions:
                self.waveform_visualizer.plot.axvspan(
                    region['start'],
                    region['end'],
                    alpha=0.3,
                    color='red'
                )
            # Redraw so the newly added spans become visible.
            self.waveform_visualizer.canvas.draw()
    # NOTE(review): the except/finally clause of this try block is not part
    # of this excerpt.
3. File Format Flexibility
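The loader dispatches on the file extension: WAV files use a dedicated reader, common compressed formats (MP3, M4A, AAC, FLAC, OGG) go through pydub, and anything else falls back to a generic loader: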
def load_audio_file(self, file_path: Path) -> bool:
    """
    Load an audio file with the loader that matches its extension.

    ``.wav`` files go to ``_load_wav_file``; ``.mp3``, ``.m4a``, ``.aac``,
    ``.flac`` and ``.ogg`` go to ``_load_pydub_file``; every other extension
    goes to ``_load_generic_audio``. The extension check ignores case.

    Args:
        file_path: Path to audio file
    Returns:
        bool: Whatever the selected loader returns (True on success)
    Raises:
        MediaConverterError: If loading fails
    """
    pydub_suffixes = ('.mp3', '.m4a', '.aac', '.flac', '.ogg')
    try:
        self.logger.info(f"Loading audio file: {file_path}")
        extension = file_path.suffix.lower()
        # Pick the loader first, then call it once.
        if extension == '.wav':
            loader = self._load_wav_file
        elif extension in pydub_suffixes:
            loader = self._load_pydub_file
        else:
            loader = self._load_generic_audio
        return loader(file_path)
    except Exception as err:
        self.logger.error(f"Error loading audio file: {str(err)}", exc_info=True)
        raise MediaConverterError(f"Error loading audio file: {str(err)}")
Modular UI Design
The application follows a modular UI design with separate sections for different functionality:
def setup_ui(self) -> None:
    """Create the root window and build each UI section inside one frame."""
    self.root = tk.Tk()
    self.root.title("Enhanced Media Converter")
    self.root.geometry("800x900")

    # Single padded container that every section is attached to.
    main_container = ttk.Frame(self.root, padding="10")
    main_container.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))

    # Build the sections in display order; each builder is looked up right
    # before it is called.
    for builder_name in (
        'setup_format_section',
        'setup_path_section',
        'setup_trim_section',
        'setup_analysis_section',
        'setup_batch_section',
        'setup_progress_section',
        'setup_log_section',
    ):
        getattr(self, builder_name)(main_container)
Implementation Challenges & Solutions
Building this application required solving several technical challenges:
1. Multi-Format Audio Processing
Challenge: Different audio formats require different loading and processing approaches.
Solution: Implemented a format-specific loading system with fallbacks:
def _load_pydub_file(self, file_path: Path) -> bool:
    """
    Load audio file using pydub

    Stores the channel count, sample width, sample rate and duration on the
    instance, and the samples in ``self.waveform_data`` as float32 scaled by
    the full-scale value of the sample width. Mono data is one-dimensional;
    multi-channel data has one column per channel.

    Args:
        file_path: Path to audio file
    Returns:
        bool: True if loading successful
    Raises:
        MediaConverterError: If pydub cannot load the file
    """
    from pydub import AudioSegment
    try:
        # Load audio file
        audio = AudioSegment.from_file(str(file_path))

        # Get audio properties
        self.channels = audio.channels
        self.sample_width = audio.sample_width
        self.sample_rate = audio.frame_rate
        self.duration = len(audio) / 1000.0  # Convert ms to seconds

        # Convert to numpy array
        samples = np.array(audio.get_array_of_samples(), dtype=np.float32)

        # Samples arrive interleaved; split them into one column per channel
        # for any channel count, not only stereo.
        if self.channels > 1:
            samples = samples.reshape(-1, self.channels)

        # Normalize to float between -1 and 1 (divide by 2**(bits - 1)).
        full_scale = float(1 << (8 * self.sample_width - 1))
        self.waveform_data = samples / full_scale
        return True
    except Exception as e:
        self.logger.error(f"Pydub loading failed: {str(e)}", exc_info=True)
        raise MediaConverterError(f"Failed to load audio with pydub: {str(e)}") from e
2. Efficient Waveform Visualization
Challenge: Displaying waveforms for large audio files while maintaining responsiveness.
Solution: Implemented zoom and pan functionality with efficient redraws:
def handle_zoom(self, event) -> None:
    """
    Handle mouse wheel zoom

    A positive ``event.delta`` zooms in by a factor of 1.1 and a negative one
    zooms out; the zoom level is kept within 1.0..50.0. Events that carry no
    delta are interpreted through ``event.num`` (4 = in, 5 = out), which is
    how X11 reports wheel movement, so the handler can also be bound to
    ``<Button-4>``/``<Button-5>``. Events with neither are ignored.

    Args:
        event: Tk event object of the wheel/button event
    """
    try:
        delta = getattr(event, 'delta', 0)
        button = getattr(event, 'num', None)
        if delta > 0 or button == 4:
            self.zoom_level *= 1.1
        elif delta < 0 or button == 5:
            self.zoom_level /= 1.1
        else:
            # No wheel movement reported: leave the view untouched instead of
            # treating a zero delta as "zoom out".
            return
        self.zoom_level = float(np.clip(self.zoom_level, 1.0, 50.0))
        self.update_view()
    except Exception as e:
        # Best effort: a failed zoom is logged, not propagated to Tk.
        self.logger.error(f"Zoom operation failed: {str(e)}", exc_info=True)
def handle_pan(self, event) -> None:
    """
    Handle mouse drag panning

    Shifts the visible window (``view_start``..``view_end``, fractions of the
    0..1.0 range) by the horizontal distance dragged since the previous
    motion event. The window is moved as a whole and stopped at the 0 and 1.0
    edges, so its width never changes while panning.

    Args:
        event: Tk motion event; only ``event.x`` is used
    """
    try:
        previous_x = getattr(self, 'last_x', None)
        if previous_x is not None:
            width = self.canvas.get_tk_widget().winfo_width()
            if width > 0:
                span = self.view_end - self.view_start
                # Dragging right moves the view towards earlier positions.
                shift = -(event.x - previous_x) * span / width
                # Clamp the shift itself (not each edge separately) so the
                # window keeps its size when it reaches either end.
                shift = max(-self.view_start, min(shift, 1.0 - self.view_end))
                self.view_start += shift
                self.view_end += shift
                self.update_view()
        # NOTE(review): last_x is not cleared in this method; no button
        # release handler that resets it is visible in this excerpt.
        self.last_x = event.x
    except Exception as e:
        # Best effort: a failed pan is logged, not propagated to Tk.
        self.logger.error(f"Pan operation failed: {str(e)}", exc_info=True)
3. Thread-Safe UI Updates
Challenge: Performing background processing while keeping the UI responsive.
Solution: Implemented a message queue system for thread communication:
def check_queue(self) -> None:
    """
    Check for updates from the conversion thread

    Drains every message currently in ``self.queue`` and applies it to the
    UI: 'progress' and 'status' update their variables, 'log' is passed to
    ``log_message``, 'error' shows an error dialog and re-enables the
    convert button, and 'done' re-enables the convert button and stops the
    polling. Unless 'done' was received, the next check is scheduled after
    100 ms.

    Only an empty queue ends the drain loop quietly; an exception raised
    while applying a message is no longer mistaken for "queue is empty" and
    propagates to the caller, while polling is still rescheduled.
    """
    from queue import Empty

    finished = False
    try:
        while True:
            try:
                msg_type, data = self.queue.get_nowait()
            except Empty:
                # Queue is empty, stop draining
                break

            if msg_type == 'progress':
                self.progress_var.set(data)
            elif msg_type == 'status':
                self.status_var.set(data)
            elif msg_type == 'log':
                self.log_message(data)
            elif msg_type == 'error':
                messagebox.showerror("Error", data)
                self.convert_btn.state(['!disabled'])
                self.cancel_btn.state(['disabled'])
            elif msg_type == 'done':
                self.convert_btn.state(['!disabled'])
                self.cancel_btn.state(['disabled'])
                finished = True
                break
    finally:
        # Keep polling until the worker reports 'done', even when applying a
        # message failed.
        if not finished:
            self.root.after(100, self.check_queue)
4. FFmpeg Integration
Challenge: Integrating FFmpeg for professional-grade audio processing without complex command-line usage.
Solution: Wrapped FFmpeg with a clean Python interface:
def process_file(self, file_path: Path, settings: Optional[Dict] = None) -> None:
    """
    Process a single file with conversion and trimming

    Builds and runs one FFmpeg command that converts ``file_path`` to the
    selected format and writes ``<stem>.<extension>`` into
    ``self.output_path``, overwriting an existing file. Trim options are only
    added when the requested range is a real sub-range of the file.

    Args:
        file_path: Path to input file
        settings: Optional dictionary of processing settings with the keys
            'start_time', 'end_time', 'format', 'bitrate', 'sample_rate' and
            'channels'. When omitted, the current UI values are used.
    Raises:
        MediaConverterError: If the settings are unusable or FFmpeg fails
    """
    try:
        # Use provided settings or current UI settings
        if settings is None:
            settings = {
                'start_time': self.start_time_var.get(),
                'end_time': self.end_time_var.get(),
                'format': self.format_var.get(),
                'bitrate': self.bitrate_var.get(),
                'sample_rate': self.sample_rate_var.get(),
                'channels': self.channel_var.get()
            }

        # Fail with a readable message instead of a TypeError / KeyError.
        if self.output_path is None:
            raise ValueError("No output directory selected")
        selected_format = settings['format']
        if selected_format not in self.format_options:
            raise ValueError(f"Unsupported format: {selected_format}")

        format_info = self.format_options[selected_format]
        output_file = Path(self.output_path) / f"{file_path.stem}.{format_info['extension']}"

        # Get file duration
        file_duration = self.get_file_duration(file_path)

        # Build FFmpeg command
        command = ['ffmpeg', '-i', str(file_path)]

        # Validate trim times: out-of-range values fall back to the file
        # bounds rather than being passed on to FFmpeg.
        start_time = self.parse_time(settings['start_time'])
        end_time = self.parse_time(settings['end_time'])
        if start_time < 0 or start_time >= file_duration:
            start_time = 0
        if end_time <= start_time or end_time > file_duration:
            end_time = file_duration

        # Only add trim parameters if they actually shorten the file
        if start_time > 0 or end_time < file_duration:
            command.extend([
                '-ss', str(start_time),
                '-t', str(end_time - start_time)  # Use duration instead of end point
            ])

        # Add conversion parameters. subprocess requires string arguments, so
        # coerce values that may arrive as numbers in batch settings.
        command.extend([
            '-acodec', format_info['codec'],
            '-ar', str(settings['sample_rate']),
            '-ac', str(settings['channels']),
            '-b:a', str(settings['bitrate']),
            '-y',  # Overwrite output
            str(output_file)
        ])

        # Run conversion
        process = subprocess.run(
            command,
            capture_output=True,
            text=True
        )
        if process.returncode != 0:
            raise MediaConverterError(f"FFmpeg error: {process.stderr}")
    except Exception as e:
        self.logger.error(f"Processing failed: {str(e)}", exc_info=True)
        raise MediaConverterError(f"Processing failed: {str(e)}") from e
Technical Skills Demonstrated
This project showcases advanced Python programming skills including:
- GUI Development: Building responsive and complex Tkinter interfaces
- Multithreading: Thread-safe background processing and UI updates
- Audio Processing: Advanced audio analysis and manipulation techniques
- Process Management: Subprocess management and output handling for FFmpeg
- Error Handling: Comprehensive error handling with custom exceptions
- Data Visualization: Interactive visualization of audio waveforms
- OOP Design: Clean object-oriented architecture with proper encapsulation
Real Impact
The application provides tangible benefits for audio processing workflows:
- Unified Interface: Combines multiple audio tools in one application
- Time Savings: Batch processing capabilities for efficient workflows
- Visual Insight: Interactive waveform display for precise editing
- Format Flexibility: Supports multiple formats with appropriate settings
- Detailed Analysis: Advanced audio analysis for deeper understanding
- Error Prevention: Robust error handling reduces processing failures
The Advanced Media Converter demonstrates how a well-designed application can transform complex audio workflows into a streamlined, user-friendly experience while providing professional-grade capabilities.