author    | xAlpharax <42233094+xAlpharax@users.noreply.github.com> | 2023-12-10 08:27:00 +0200
committer | xAlpharax <42233094+xAlpharax@users.noreply.github.com> | 2023-12-10 08:27:00 +0200
commit    | ccbbd775a921f92e5472ccf53317d9e31bd9152d (patch)
tree      | b115396769ced53e1bc088eef78e645e1edd4c83 /prototype.py
parent    | 316148c4ced0da3f2592d04c09acceeb346ce07b (diff)
Remade some parts of the readme and re-aligned the repo structure.
Changes to be committed:
modified: README.md
renamed: prototype.py -> ml-integrations/prototype.py
new file: testing/.gitkeep
new file: training/.gitkeep
Diffstat (limited to 'prototype.py')
-rw-r--r-- | prototype.py | 148
1 file changed, 0 insertions, 148 deletions
diff --git a/prototype.py b/prototype.py
deleted file mode 100644
index 35461eb..0000000
--- a/prototype.py
+++ /dev/null
@@ -1,148 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from torch.utils.data import Dataset, DataLoader
-
-from sklearn.model_selection import train_test_split
-
-# Sample data - replace this with your dataset
-courses_data = [
-    {"sequence": "python programming", "label": "programming"},
-    {"sequence": "web development with HTML and CSS", "label": "web development"},
-    # Add more data...
-]
-
-# Preprocess the data
-sequences = [d["sequence"] for d in courses_data]
-labels = [d["label"] for d in courses_data]
-
-# Create a mapping from labels to unique indices
-label2index = {label: idx for idx, label in enumerate(set(labels))}
-index2label = {idx: label for label, idx in label2index.items()}
-
-# Convert labels to indices
-label_indices = [label2index[label] for label in labels]
-
-# Split the data into training and testing sets
-train_sequences, test_sequences, train_labels, test_labels = train_test_split(
-    sequences, label_indices, test_size=0.2, random_state=42
-)
-
-# Define a simple RNN model
-class CourseRecommendationModel(nn.Module):
-    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
-        super(CourseRecommendationModel, self).__init__()
-        self.embedding = nn.Embedding(vocab_size, embedding_dim)
-        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
-        self.fc = nn.Linear(hidden_size, num_classes)
-
-    def forward(self, x):
-        x = self.embedding(x)
-        _, hn = self.rnn(x)
-        output = self.fc(hn[-1, :, :])
-        return output
-
-##
-# Define a simple Transformer model
-class CourseRecommendationModel(nn.Module):
-    def __init__(self, vocab_size, embedding_dim, hidden_size, num_heads, num_layers, num_classes):
-        super(CourseRecommendationModel, self).__init__()
-        self.embedding = nn.Embedding(vocab_size, embedding_dim)
-        self.transformer = nn.Transformer(
-            d_model=embedding_dim,
-            nhead=num_heads,
-            num_encoder_layers=num_layers,
-            num_decoder_layers=num_layers,
-        )
-        self.fc = nn.Linear(embedding_dim, num_classes)
-
-    def forward(self, x):
-        x = self.embedding(x)
-        x = x.permute(1, 0, 2)  # Change the sequence length dimension
-        output = self.transformer(x)
-        output = output.mean(dim=0)  # Aggregate over the sequence dimension
-        output = self.fc(output)
-        return output
-
-# Hyperparameters (transformer)
-vocab_size = 10000  # Replace with the actual vocabulary size
-embedding_dim = 50
-num_heads = 4
-num_layers = 2
-num_classes = len(set(labels))
-batch_size = 32
-learning_rate = 0.001
-epochs = 10
-##
-
-# Hyperparameters
-vocab_size = 10000  # Replace with the actual vocabulary size
-embedding_dim = 50
-hidden_size = 64
-num_classes = len(set(labels))
-batch_size = 32
-learning_rate = 0.001
-epochs = 10
-
-# Convert sequences to numerical format
-# In a real-world scenario, you might want to use tokenization libraries like spaCy or nltk.
-# For simplicity, we'll represent each word with an index in this example.
-sequence_indices = [[vocab_size // 2 if word == "python" else vocab_size // 3 for word in sequence.split()] for sequence in train_sequences]
-
-# Create DataLoader for training
-class CourseDataset(Dataset):
-    def __init__(self, sequences, labels):
-        self.sequences = sequences
-        self.labels = labels
-
-    def __len__(self):
-        return len(self.sequences)
-
-    def __getitem__(self, idx):
-        return torch.tensor(self.sequences[idx]), torch.tensor(self.labels[idx])
-
-train_dataset = CourseDataset(sequence_indices, train_labels)
-train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
-
-## transformer
-# Initialize the model, loss function, and optimizer
-#model = CourseRecommendationModel(vocab_size, embedding_dim, embedding_dim, num_heads, num_layers, num_classes)
-
-# Initialize the model, loss function, and optimizer
-model = CourseRecommendationModel(vocab_size, embedding_dim, hidden_size, num_classes)
-criterion = nn.CrossEntropyLoss()
-optimizer = optim.Adam(model.parameters(), lr=learning_rate)
-
-# Training loop
-for epoch in range(epochs):
-    for batch_seq, batch_labels in train_loader:
-        optimizer.zero_grad()
-        output = model(batch_seq)
-        loss = criterion(output, batch_labels)
-        loss.backward()
-        optimizer.step()
-
-    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")
-
-# Save the trained model
-#torch.save(model.state_dict(), 'transformer_course_recommendation_model.pth')
-torch.save(model.state_dict(), 'course_recommendation_model.pth')
-
-# Evaluate on test data (similar preprocessing as done for training data)
-test_sequence_indices = [[vocab_size // 2 if word == "python" else vocab_size // 3 for word in sequence.split()] for sequence in test_sequences]
-test_dataset = CourseDataset(test_sequence_indices, test_labels)
-test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
-
-model.eval()
-correct = 0
-total = 0
-
-with torch.no_grad():
-    for batch_seq, batch_labels in test_loader:
-        output = model(batch_seq)
-        _, predicted = torch.max(output, 1)
-        total += batch_labels.size(0)
-        correct += (predicted == batch_labels).sum().item()
-
-accuracy = correct / total
-print(f"Accuracy on test data: {accuracy * 100:.2f}%")
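The deleted prototype's own comment notes that a real implementation would tokenize with a library such as spaCy or nltk rather than mapping every word to one of two fixed indices. Below is a minimal sketch of the vocabulary-based encoding that comment alludes to; the helpers build_vocab and encode and the reserved PAD/UNK indices are illustrative assumptions, not part of the repository.

# Sketch only: build_vocab/encode are hypothetical helpers, not repository code.
from collections import Counter

PAD_IDX, UNK_IDX = 0, 1  # reserved indices for padding and unknown words

def build_vocab(texts, min_freq=1):
    # Assign a unique index to every word that appears at least min_freq times.
    counts = Counter(word for text in texts for word in text.lower().split())
    vocab = {"<pad>": PAD_IDX, "<unk>": UNK_IDX}
    for word, freq in counts.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

def encode(text, vocab):
    # Map a whitespace-tokenized string to a list of vocabulary indices.
    return [vocab.get(word, UNK_IDX) for word in text.lower().split()]

# With the prototype's variables, this would replace the hard-coded scheme:
# vocab = build_vocab(train_sequences)
# sequence_indices = [encode(seq, vocab) for seq in train_sequences]
# vocab_size = len(vocab)  # instead of the fixed 10000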
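The prototype also feeds variable-length index lists through a DataLoader with batch_size = 32; PyTorch's default collate function cannot stack tensors of different lengths, so batching like this generally needs a padding collate function. A minimal sketch follows, assuming index 0 is reserved for padding; pad_collate is a hypothetical helper, not part of the repository.

# Sketch only: pad_collate is a hypothetical helper, not repository code.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def pad_collate(batch):
    # Pad every sequence in the batch to the length of the longest one.
    sequences, labels = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.stack(labels)

# With the prototype's dataset, it would be passed to the loader:
# train_loader = DataLoader(train_dataset, batch_size=batch_size,
#                           shuffle=True, collate_fn=pad_collate)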