Python  -:    
 


    . ,    .  ,    . ,      .              .

         :  , , U-Net, attention, Tacotron.           ,  ,  ,       .

 .   .  ,   .  ,      ,       .





 

Python  -:    








    

      .   ,  ,  ,  ,   .    ,   ,       .   -.   .

     .   ,     .      .     .  ,       .    .   -. ,   .

     .  ,    .        .    , ,       .     .          ,      .    ,      .       .

  .         .   ,  ,       .  ,        ,   .  ,    ,   ,     .  .    .           ,   WaveNet.

      .    ,         .   ,    Python              .

   

   ,         Python  :          Python:    .       .  ,   ,   .     .

   ,       ,     Python       .  ,    ,   .     , ,      .        .

   ,   ,    ,       iZotope RX, Adobe Podcast, ElevenLabs,    ,   ,    .     .           .

   ,   ,         .       ,  ,   ,       .           .

   

    .          .      Python,  ,        .      ,   .            .

        :   .  ,         ,  .    ,       , ,         .

     ,         .       :    ,       .

        ,       .   ,     ,   .     Demucs     .

      .            ,   .

       .  ,        .          .

      .      ,  , ,   .    ,     .

          .       ,    ,     .

     .    ,  :   ,      ,  ,     .  ,     , ,      .

      ,     .     .      ,     , .   .       PyTorch,     .   :  , , .    :  ,    .    :     .     .

  PyTorch

      .   PyTorch. ?     .   PyTorch    Python.    print()     ,    .       .    .

TensorFlow   ,   API  ,   .     ,   ,      ,  PyTorch  .

      PyTorch   .          . ,      ,   ,     .

  

  Python 3.9     : torch, torchaudio, librosa, numpy, soundfile, matplotlib.     pip  .       .

    Python       ,     .        ,        .                         .

    ,       .        .   .      ,  :      ,     . ,     ,    .

,  ,   .      ,   ,     .




   


     2025 .       .     ,    ,  .   ,      , -   .

  ,   .  .  ,     .     .      .        ,     ,   ,     .

    .    .       ,      .

  ,     - .         .   ,  ,    .     ,   .    ,  , -     .   ,       ,  .  ,      ,   .     .  ,    .

        RNNoise,   Mozilla.   ,  .        .   . , ,  .        ,      .  .  .  .    .

 ,            .       .      .  ,      ,     ,        .     ,      ,                     .     .

   .  ,         .      .          .   ,   ,     .    ,   ,   .     .        ,  , ,    .        .

             ,    .     ,       .             .  ,    ,  .         .  ,   ,   ,    .

     .     ,  .      ,    :    ,   ,  .    :   ?   ?    ?        , ,  .

  .     .   .    . ,     ,     .

.




.      


   

   .     .  ,    :   .    , , ,   .    .    .         ,      ,       .

     .  .  .  .      ,          , ,  .     ,    ,   .

   .   ,    ,          ,  .      ,   .

    .     .      .   ? .    ?  .    - ?   .

    ,     .          ,   .   ,        .   ,      .

            .         ,    .        ,      .    ,    ,        .     ,      .

    

     .    :  ,  ,  .      librosa.effects.split, nr.reduce_noise, pyln.normalize.loudness    .   ,  ,   ,  .   : .

     .    .  ,      ,        ,           .      .       .   : .

      .  ,    ,   .    ,      ,        ,   .             ,    ,    .   :   .

    ,    .  ,   .  ,     .  ,   ,   .

     

   ,    .     .      - .   ,     .     .  ,  ,   ,   .          .

       , ,  ?    .  :     ,     ,   .  ,     .      . Ÿ         ,   .         ,       ,       .

   ,   .   ,      ,       .   ,                 .              .    ,  , ,     .

  ,     ,    .    ,     ,      .       ,    ,    .          ,    .         ,        .

     

       .  ,        .  ,       .  ,               .

  .     .     .               .              ,    .             .

   .   ,   ,   .     .     ,   . ,       . ,      .    ,         ,     .

  ,    .  - ,   ,     ,       .   ,    ,     .

   

    .         ,        .

      :    ,     ,      .       PyTorch          ,   .     ,       .

          :   .  ,         ,    ,     ,      .

       .  ,         , ,  ,         .    ,      .

       ,         .       ,         .

       .     Open-Unmix  Demucs,             .      ,    .

      .    ,       .             .

      .  ,    Text-to-Speech,        ,     .

         .   ,    ,  , ,    .    ,      .

        .      ,          ,       .

     .    ,   :             .

  

      Python 3.9  .  ,          : numpy, librosa, soundfile, scipy, matplotlib.       :

bash

pip install numpy librosa soundfile scipy matplotlib

    PyTorch.     ,     :

bash

pip install torch torchaudio

     NVIDIA        ,     CUDA.      pytorch.org (https://pytorch.org/).      .        ,       .

   torchaudio      ,   PyTorch.    ,             PyTorch,     .

      .         .         .

   

  :     .     ,   .            ,    ,       ,   .

    .   .    ,     .           .           ,     .

,  , .  .    .      .   ,  .       ,    .

   

      .    ,             .     .  ,     .     .        .    .   ! ,    ,  !

    .    ,    ,     .     ,    ,   .  ,      .                  ,    .




 1.    :   


    .  .     

   

,       ,    :   . ,  ,  ,            .    ,      .     ,      .

            .  ,   ,    , ,   .      Python,       .           ,       .

      .  ,    ,      .    PyTorch  ,        .     ,         ,    .

           ,  ,  ,    .  .   .

  

      ,     :      ?      . . .  .  .  .       .  ?

,     ,       .          .        .      .   ,             ,         .    :     X    Y,   .       Z,   .

  .     .       .         .       .       .         ,   -    .

    .    .   dataset   :   ,   ,   .    ,     ,      .         .   ,   ,   .

 ,    ,     .     .     . ,       .       ,      ,     .

 : ,  

          .         ,      ,     ,    .    ,  .

      .         ,     .      ,             .   ,     .           .       .

   :

$y = f(w_1 x_1 + w_2 x_2 + \dots + w_n x_n + b)$

 x1, x2  , w1, w2  , b  , f   , y  .

     ,      .    ,     .             .   ,    ,       .   -   ,   .   .      .

   Python

     .            : ,    .

python

import numpy as np

class SingleNeuron:
    """A single sigmoid neuron with one scalar weight and one bias.

    The neuron computes ``sigmoid(weight * x + bias)`` and is trained one
    example at a time with a gradient-descent update.
    """

    def __init__(self):
        # Both parameters start as small random values: a standard-normal
        # draw scaled by 0.1 (weight is drawn first, then bias).
        self.weight = 0.1 * np.random.randn()
        self.bias = 0.1 * np.random.randn()

    def sigmoid(self, x):
        """Return the logistic function of ``x``, a value between 0 and 1."""
        denominator = 1.0 + np.exp(-x)
        return 1.0 / denominator

    def forward(self, x):
        """Return the neuron's output for the input ``x``.

        The output is ``sigmoid(weight * x + bias)``.
        """
        pre_activation = self.weight * x + self.bias
        return self.sigmoid(pre_activation)

    def train_step(self, x, y_true, learning_rate=0.1):
        """Run one gradient-descent update on a single example.

        Args:
            x: Input value.
            y_true: Target output for ``x``.
            learning_rate: Step size applied to both gradients.

        Returns:
            The signed error ``prediction - y_true`` measured *before* the
            parameters were updated.
        """
        prediction = self.forward(x)
        error = prediction - y_true

        # Factor shared by both gradients: the error multiplied by the
        # sigmoid derivative, which is prediction * (1 - prediction).
        delta = error * prediction * (1 - prediction)

        # The weight gradient additionally carries the input value.
        self.weight = self.weight - learning_rate * (delta * x)
        self.bias = self.bias - learning_rate * delta

        return error

#  

# Create the neuron; its weight and bias start as small random values.
neuron = SingleNeuron()

# Training set of (input, target) pairs: inputs in the 0.10-0.25 range are
# labelled 0.0 and inputs in the 0.70-0.88 range are labelled 1.0.
train_data = [
    (0.15, 0.0), (0.22, 0.0), (0.18, 0.0), (0.25, 0.0), (0.10, 0.0),
    (0.75, 1.0), (0.82, 1.0), (0.70, 1.0), (0.88, 1.0), (0.79, 1.0),
]

print("  :")
print(" |  |  |  |  |  | ")
print("-" * 70)

# Train for 200 passes over the data, one example at a time.
for epoch in range(200):
    for x, y_true in train_data:
        # train_step runs its own forward pass, so no separate prediction
        # is needed here.
        neuron.train_step(x, y_true, learning_rate=0.5)

    # Report the first ten epochs and then every 50th, always using the
    # first training example as the probe.
    if epoch % 50 == 0 or epoch < 10:
        x_example, y_example = train_data[0]
        y_pred_example = neuron.forward(x_example)
        print(f"{epoch:5d} | {neuron.weight:+.3f} | {neuron.bias:+.3f} | "
              f"{x_example:.2f} | {y_pred_example:.4f} | {y_example:.1f} | "
              f"{abs(y_pred_example - y_example):.4f}")

# Evaluate the trained neuron on inputs it has not seen.
print("\n  :")
test_data = [0.12, 0.30, 0.65, 0.90, 0.20, 0.85]
for x in test_data:
    pred = neuron.forward(x)
    # NOTE(review): both label strings are empty in this copy of the source,
    # so the printed class name is always blank -- restore the two labels.
    label = "" if pred > 0.5 else ""
    print(f"  : {x:.2f} -> {pred:.4f} ({label})")

  .  ,           0.5   .     .          :    0.2    ,    0.8    .

  .            .       ,   .      .     :      ,     1.

      .          .        ,   .      ,  .    .

   

            .        .     ,      ,         ,  ,      .

     .       .       .          .      .   .      ,         .

    ,    ,    :            .     .  ,          ,         .

 : , , 

        .            ,   .     PyTorch  ,         .

python

import torch

import torch.nn as nn

import torch.optim as optim

import librosa

import numpy as np

def extract_features(y, sr, frame_size=2048, hop_length=512):
    """Compute four descriptors for every full frame of an audio signal.

    The signal is cut into frames of ``frame_size`` samples taken every
    ``hop_length`` samples; a trailing partial frame is ignored.  For each
    frame the function computes:

    1. RMS amplitude.
    2. Zero-crossing rate (sign changes per sample).
    3. Spectral centroid in kHz (magnitude-weighted mean frequency of the
       Hamming-windowed spectrum).
    4. Spectral roll-off in kHz (lowest frequency below which 85% of the
       spectral energy lies).

    Args:
        y: One-dimensional sequence of audio samples.
        sr: Sample rate in Hz.
        frame_size: Number of samples per frame.
        hop_length: Number of samples between the starts of adjacent frames.

    Returns:
        A ``float32`` array of shape ``(num_frames, 4)``.  When the signal
        is shorter than one frame the result has shape ``(0, 4)``, so the
        feature dimension is always present.
    """
    y = np.asarray(y)

    num_frames = (len(y) - frame_size) // hop_length + 1
    if num_frames <= 0:
        # Not even one full frame: return an empty but correctly shaped array.
        return np.empty((0, 4), dtype=np.float32)

    # Every frame has exactly frame_size samples, so the window and the
    # frequency axis are the same for all frames and are computed once.
    half = frame_size // 2
    window = np.hamming(frame_size)
    positive_freqs = np.fft.fftfreq(frame_size, 1 / sr)[:half]

    features = np.empty((num_frames, 4), dtype=np.float32)

    for i in range(num_frames):
        start = i * hop_length
        frame = y[start:start + frame_size]

        # Feature 1: RMS amplitude.
        rms = np.sqrt(np.mean(frame ** 2))

        # Feature 2: zero-crossing rate.  Each sign flip changes np.sign by
        # 2, hence the factor of 2 in the denominator.
        zcr = np.sum(np.abs(np.diff(np.sign(frame)))) / (2 * len(frame))

        # Magnitude spectrum of the windowed frame, positive frequencies only.
        positive_spectrum = np.abs(np.fft.fft(frame * window))[:half]

        # Feature 3: spectral centroid (0 for an all-zero spectrum).
        magnitude_sum = np.sum(positive_spectrum)
        if magnitude_sum > 0:
            centroid = np.sum(positive_freqs * positive_spectrum) / magnitude_sum
        else:
            centroid = 0

        # Feature 4: spectral roll-off at 85% of the cumulative energy
        # (0 for an all-zero spectrum).
        cumsum = np.cumsum(positive_spectrum ** 2)
        if cumsum[-1] > 0:
            rolloff_idx = np.where(cumsum >= 0.85 * cumsum[-1])[0][0]
            rolloff = positive_freqs[rolloff_idx]
        else:
            rolloff = 0

        # Frequencies are stored in kHz.
        features[i] = (rms, zcr, centroid / 1000, rolloff / 1000)

    return features

class SoundClassifier(nn.Module):
    """Small fully connected classifier for per-frame audio features.

    Architecture: 4 input features -> 16 hidden units -> 8 hidden units ->
    3 class scores, with ReLU after each hidden layer.

    ``forward`` returns raw class scores (logits), not probabilities.  The
    file trains this model with ``nn.CrossEntropyLoss``, which applies
    log-softmax internally and expects unnormalised logits; feeding it
    already-softmaxed values applies softmax twice and flattens the
    gradients.  The arg-max over the outputs (the predicted class) is the
    same for logits and probabilities.  To obtain probabilities, call
    ``model.softmax(model(x))``.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(4, 16)   # input features -> hidden layer 1
        self.fc2 = nn.Linear(16, 8)   # hidden layer 1 -> hidden layer 2
        self.fc3 = nn.Linear(8, 3)    # hidden layer 2 -> 3 class scores
        self.relu = nn.ReLU()         # activation used after fc1 and fc2
        # Not used inside forward(); kept so callers can turn logits into
        # probabilities explicitly.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits of shape ``(batch, 3)`` for ``x`` of shape ``(batch, 4)``."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

#  ,    

# Instantiate the network together with the loss function and optimizer
# that will be used to train it.
model = SoundClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Show the layer structure, then the total number of scalar parameters
# summed over every parameter tensor of the model.
print(" :")
print(model)
print(f"\n : {sum(tensor.numel() for tensor in model.parameters())}")

       :  ,   ,     .    ,        ,   ,   .   ,          ,    .   ,         ,    .

    .          .     .         : , , .   ReLU            ,     .

 

     .      .      :  ,   ,     ,   .

python

def generate_synthetic_data(n_samples=500):
    """Build a reproducible toy dataset of 4-feature vectors for three classes.

    Each feature of each class is drawn from its own normal distribution.
    The global NumPy random generator is re-seeded with 42 on every call,
    so repeated calls return identical data.

    Args:
        n_samples: Number of examples generated *per class*; the returned
            arrays therefore hold ``3 * n_samples`` rows.

    Returns:
        A ``(data, labels)`` tuple: ``data`` is a ``float32`` array with one
        row of features (rms, zcr, centroid, rolloff) per example and
        ``labels`` holds the matching class ids 0, 1 or 2.  Rows are
        shuffled with the same permutation in both arrays.
    """
    np.random.seed(42)

    # One (mean, std) pair per feature, ordered rms, zcr, centroid, rolloff.
    class_profiles = (
        ((0.3, 0.1), (0.15, 0.05), (0.8, 0.2), (0.6, 0.3)),    # class 0
        ((0.4, 0.15), (0.25, 0.08), (1.5, 0.5), (2.0, 1.0)),   # class 1
        ((0.2, 0.1), (0.45, 0.1), (2.5, 1.0), (3.0, 1.5)),     # class 2
    )

    rows = []
    targets = []
    for _ in range(n_samples):
        # One example of each class per iteration, in class order; features
        # are drawn left to right so the random stream is consumed in a
        # fixed order.
        for class_id, profile in enumerate(class_profiles):
            rows.append([np.random.normal(mean, std) for mean, std in profile])
            targets.append(class_id)

    feature_matrix = np.array(rows, dtype=np.float32)
    target_vector = np.array(targets)

    # Shuffle examples and labels with one shared permutation.
    order = np.random.permutation(len(feature_matrix))
    return feature_matrix[order], target_vector[order]

#  

# Generate the synthetic dataset (500 examples for each class).
X, y = generate_synthetic_data(500)

# The first 80% of the (already shuffled) rows are used for training and
# the remaining 20% are held out for testing.
split = int(0.8 * len(X))
X_train, X_test = np.split(X, [split])
y_train, y_test = np.split(y, [split])

# Convert to PyTorch tensors; class labels must be int64 for the loss.
X_train_tensor = torch.tensor(X_train)
X_test_tensor = torch.tensor(X_test)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

print(f" : {len(X_train)}")
print(f" : {len(X_test)}")

# Fresh model, loss and optimizer for this training run.
model = SoundClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

num_epochs = 200
for epoch in range(num_epochs):
    # Clear gradients left over from the previous step, then run a
    # full-batch forward pass, back-propagate and update the weights.
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Report every 40th epoch and the final one.
    is_report_epoch = epoch % 40 == 0 or epoch == num_epochs - 1
    if is_report_epoch:
        # Held-out accuracy, computed without tracking gradients.
        with torch.no_grad():
            test_outputs = model(X_test_tensor)
            predicted = torch.max(test_outputs, 1).indices
            accuracy = (predicted == y_test_tensor).float().mean()

        print(f" {epoch:3d}:  = {loss.item():.4f},  = {accuracy:.2%}")

print("\n !")

  .  ,       ,      .             ,           .

,    . criterion = nn.CrossEntropyLoss()   ,  ,       . optimizer = optim.Adam(model.parameters(), lr=0.01)   Adam,        . loss.backward()   PyTorch:       . optimizer.step()       .

    

       .   ,    ,       .

python

def classify_audio_file(filepath, model, label_names):
    """Classify every analysis frame of an audio file and print a summary.

    The file is loaded as mono at 22050 Hz, converted to per-frame features
    with ``extract_features`` and passed through ``model``; the predicted
    class of each frame is the index of its highest output.

    Args:
        filepath: Path of the audio file to load.
        model: Callable model mapping a ``(frames, 4)`` float tensor to a
            ``(frames, n_classes)`` tensor of scores.
        label_names: Sequence mapping a class index to its display name.

    Returns:
        A NumPy integer array with one predicted class index per frame.
        The array is empty when the file is too short to yield a frame.
    """
    y, sr = librosa.load(filepath, sr=22050, mono=True)
    features = extract_features(y, sr)

    if len(features) == 0:
        # The file is shorter than one analysis frame: there is nothing to
        # feed the model, so report zero frames instead of crashing on an
        # empty, wrongly shaped tensor.
        predicted = np.empty(0, dtype=np.int64)
    else:
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(torch.tensor(features))
        predicted = torch.max(outputs, 1).indices.numpy()

    # Count how many frames were assigned to each class.
    unique, counts = np.unique(predicted, return_counts=True)
    total = len(predicted)

    print(f"\n : {filepath}")
    print(f"   : {total}")
    for cls, count in zip(unique, counts):
        print(f"  {label_names[cls]}: {count}  ({count / total:.1%})")

    return predicted

#    ( )   

import soundfile as sf

# Display names for class indices 0, 1 and 2.
# NOTE(review): all three names are empty strings in this copy of the
# source, so the printed summary shows blank class names -- restore them.
label_names = ['', '', '']

# Build a 4-second test signal at 22050 Hz made of three segments.
sr = 22050
t = np.linspace(0, 4.0, int(sr * 4.0), endpoint=False)
test_signal = np.zeros_like(t)

tone_end = int(1.5 * sr)    # first segment: 0.0 s - 1.5 s
noise_end = int(3.0 * sr)   # second segment: 1.5 s - 3.0 s

# Segment 1: a 440 Hz sine plus its 880 Hz octave at half amplitude.
test_signal[:tone_end] = (np.sin(2 * np.pi * 440 * t[:tone_end]) +
                          0.5 * np.sin(2 * np.pi * 880 * t[:tone_end]))

# Segment 2: Gaussian white noise scaled by 0.3.
test_signal[tone_end:noise_end] = np.random.randn(int(1.5 * sr)) * 0.3

# Segment 3: a recorded voice sample when one is available.
try:
    speech, _ = librosa.load('voice_sample.wav', sr=sr, mono=True)
    test_signal[noise_end:] = speech[:len(test_signal) - noise_end]
except Exception:
    # Deliberate best-effort fallback: if the sample is missing, unreadable
    # or too short to fill the segment, use a 220 Hz sine instead.
    # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
    # and SystemExit propagate.
    test_signal[noise_end:] = np.sin(2 * np.pi * 220 * t[noise_end:])

sf.write('test_mixed.wav', test_signal, sr)
print(" test_mixed.wav   ,   ")

# Classify the generated file frame by frame.
predictions = classify_audio_file('test_mixed.wav', model, label_names)

     .    :    ,   ,   .   ,  ,    ,        .

 :  

           1958 .       ,        .  ,      .

 1969        ,    ,           XOR   .          ,     .

   1980-,       ,          XOR    .       ,      .    XOR,   ,        .

 

  ,   .   .        .       .   0.1, 0.01, 0.001.  Adam     0.001–0.0001.

     ,    .  .      ,    . :   ,  ,    .

    ,    .  .       ,  ,     . :       .

     CPU    .        .   ,  GPU:  PyTorch          .to('cuda').

 

   : ,        .        extract_features.           .    ?

    ,    .   ,     .          ?

  .    (, RMS   ),        :         ,   .  ,     .





               .  ,       ,    ,    ,    .




 2.   :   


   

          .          : ,   ,  .  ,       .   ,   .   :    ,  ,    .  ,    - ? ,            ,   -  ,     ?

        .         :  .  .      :       .         22 050    22 050 .        ,     .

    ,       .      :    ,         .      ,            .  - :      .    ,    ,      .      , ,    ,      .

         .  ,     ,    .    ,   ,         ,     - .

    

     .     ,    ,       ,          .       ,    STFT.   ,        .

    ,    .            ,      .            .      .         ,   .        .

 ,   ,   , , .      ,    ,          ,   ,    ,       :  .    ,   .   ,      .

   

  ,       ,     .

python

import torch

import torchaudio

import librosa

import numpy as np

def audio_to_mel_spectrogram(y, sr, n_mels=128, n_fft=2048, hop_length=512):

"""

   -.



-     :

   ,   .

"""

#    PyTorch,   numpy-

if isinstance(y, np.ndarray):

y = torch.tensor(y, dtype=torch.float32)



#  

mel_transform = torchaudio.transforms.MelSpectrogram(

sample_rate=sr,

n_fft=n_fft,

hop_length=hop_length,

n_mels=n_mels,

power=2.0

)



# 

mel_spec = mel_transform(y)




  .


   .

   ,     (https://www.litres.ru/pages/biblio_book/?art=74004886)  .

      Visa, MasterCard, Maestro,    ,   ,     ,  PayPal, WebMoney, ., QIWI ,       .


