Steps to create baseline results with ML models on one-hot encoded data.
# Build one dataset object per task, all constructed from DATA_STORE.
# The dataset classes and DATA_STORE are defined outside this section; the
# code below reads X_train / y_train / X_test / y_test from each object.
acp_data = ACPDataset(DATA_STORE)
amp_data = AMPDataset(DATA_STORE)
dna_data = DNABindDataset(DATA_STORE)
Building baseline models for the ACP dataset with one-hot encoding and evaluating predictions on the test set with and without dimensionality reduction.
# One-hot encode the ACP features as a dense array. The encoder is fitted on
# the training split only and then reused, unchanged, on the test split.
# Assuming this is scikit-learn's OneHotEncoder: the dense-output keyword was
# renamed from `sparse` to `sparse_output` in 1.2 and `sparse` was removed in
# 1.4, so try the new spelling first and fall back for older installations.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # Older scikit-learn (< 1.2) does not know `sparse_output`.
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
X_train = ohe.fit_transform(acp_data.X_train)
y_train = acp_data.y_train
X_test = ohe.transform(acp_data.X_test)
y_test = acp_data.y_test
# Bare expression: displays the four shapes when run as a notebook cell.
X_train.shape, y_train.shape, X_test.shape, y_test.shape
# Fit a 50-component PCA on the encoded ACP training matrix and keep the
# projected training data for plotting and for the reduced-data baseline.
pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train)
print(f'X_train_pca.shape: {X_train_pca.shape}')
# Report the explained-variance ratio of the first 10 components only.
print(
f"Explained variance ratio of the first 10 principal components:\n{pca.explained_variance_ratio_[:10]}"
)
# Plot helpers are defined elsewhere; presumably they scatter the first two /
# three principal components coloured by label -- TODO confirm.
visualize_2pcs(X_train_pca, y_train)
visualize_3pcs(X_train_pca, y_train)
Evaluation on full data
# Baseline on the full one-hot encoded ACP features. train_predict is defined
# elsewhere; presumably it fits on the train split and reports test results
# -- TODO confirm.
train_predict(X_train, y_train, X_test, y_test)
Evaluation on reduced data
# Project the test split with the PCA already fitted on the training data
# (transform only, no refit), then rerun the baseline on the reduced features.
X_test_pca = pca.transform(X_test)
train_predict(X_train_pca, y_train, X_test_pca, y_test)
Creating data and learner objects to perform grid search.
# Rebind the split variables to the raw ACP data taken straight from the
# dataset object, replacing the manually encoded arrays used above. The
# Learner instances are created with ohe=True; presumably they encode
# internally -- TODO confirm against the Learner class.
X_train, y_train = acp_data.X_train, acp_data.y_train
X_test, y_test = acp_data.X_test, acp_data.y_test
# Two learners over the same split that differ only in the `pca` flag.
acp_learner = Learner(X_train, y_train, X_test, y_test, ohe=True, pca=False)
acp_learner_pca = Learner(X_train, y_train, X_test, y_test, ohe=True, pca=True)
# Bare expressions: display each learner's pipeline steps in the notebook.
acp_learner.pipeline.steps
acp_learner_pca.pipeline.steps
# train() returns a pair; both values are discarded here.
_, _ = acp_learner.train()
_, _ = acp_learner_pca.train()
# Label spreading is run on the PCA learner only.
acp_learner_pca.run_label_spreading()
On full dataset.
# Run prediction with the non-PCA ACP learner; the predict_results attribute
# is written to CSV further down.
acp_learner.predict()
On the dimensionality-reduced dataset.
# Run prediction with the PCA-enabled ACP learner; the predict_results
# attribute is written to CSV further down.
acp_learner_pca.predict()
Save results.
# Write each ACP learner's predict_results to a CSV under EXPERIMENT_STORE
# (the to_csv call suggests a pandas object -- TODO confirm).
acp_learner.predict_results.to_csv(f'{EXPERIMENT_STORE}/acp_learner.csv')
acp_learner_pca.predict_results.to_csv(f'{EXPERIMENT_STORE}/acp_learner_pca.csv')
# pick_k() returns the data that is then handed to analyze_clusters. The
# cluster count k=7 is hard-coded here; presumably it was chosen by inspecting
# pick_k's output -- TODO confirm.
X_pca = acp_learner_pca.pick_k()
acp_learner_pca.analyze_clusters(X_pca, k=7)
# Quick inspection of the AMP dataset. Every line is a bare expression that is
# only displayed when it is the last statement of a notebook cell.
# First five rows of the train and test tables.
amp_data.train.head(5)
amp_data.test.head(5)
# Maximum of the "length" column in each split.
amp_data.train["length"].max(), amp_data.test["length"].max()
# Number of feature columns in each split.
amp_data.X_train.shape[1], amp_data.X_test.shape[1]
One Hot Encoding
# Bare expression: displays the raw AMP feature shapes before encoding.
amp_data.X_train.shape, amp_data.X_test.shape
# One-hot encode the AMP features as a dense array. The encoder is fitted on
# the training split only and then reused, unchanged, on the test split.
# Assuming this is scikit-learn's OneHotEncoder: the dense-output keyword was
# renamed from `sparse` to `sparse_output` in 1.2 and `sparse` was removed in
# 1.4, so try the new spelling first and fall back for older installations.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # Older scikit-learn (< 1.2) does not know `sparse_output`.
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
X_train = ohe.fit_transform(amp_data.X_train)
y_train = amp_data.y_train
X_test = ohe.transform(amp_data.X_test)
y_test = amp_data.y_test
# Bare expression: displays the four shapes when run as a notebook cell.
X_train.shape, y_train.shape, X_test.shape, y_test.shape
Dimensionality Reduction - PCA
# Fit a 50-component PCA on the encoded AMP training matrix and keep the
# projected training data for plotting and for the reduced-data baseline.
pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train)
# Bare expression: displays the projected shape in the notebook.
X_train_pca.shape
# Report the explained-variance ratio of the first 10 components only.
print(
f"Explained variance ratio of the first 10 principal components:\n{pca.explained_variance_ratio_[:10]}"
)
# Plot helpers are defined elsewhere; presumably they scatter the first two /
# three principal components coloured by label -- TODO confirm.
visualize_2pcs(X_train_pca, y_train)
visualize_3pcs(X_train_pca, y_train)
On full data
# Baseline on the full one-hot encoded AMP features. train_predict is defined
# elsewhere; presumably it fits on the train split and reports test results
# -- TODO confirm.
train_predict(X_train, y_train, X_test, y_test)
On Dimensionality-Reduced Data
# Project the test split with the PCA already fitted on the training data
# (transform only, no refit), then rerun the baseline on the reduced features.
X_test_pca = pca.transform(X_test)
train_predict(X_train_pca, y_train, X_test_pca, y_test)
# Rebind the split variables to the raw AMP data taken straight from the
# dataset object, replacing the manually encoded arrays used above. The
# Learner instances are created with ohe=True; presumably they encode
# internally -- TODO confirm against the Learner class.
X_train, y_train = amp_data.X_train, amp_data.y_train
X_test, y_test = amp_data.X_test, amp_data.y_test
# Two learners over the same split that differ only in the `pca` flag.
amp_learner = Learner(X_train, y_train, X_test, y_test, ohe=True, pca=False)
amp_learner_pca = Learner(X_train, y_train, X_test, y_test, ohe=True, pca=True)
# train() returns a pair; both values are discarded here.
_, _ = amp_learner.train()
_, _ = amp_learner_pca.train()
# Label spreading is run on the PCA learner only.
amp_learner_pca.run_label_spreading()
On full dataset.
# Run prediction with the non-PCA AMP learner; the predict_results attribute
# is written to CSV further down.
amp_learner.predict()
On the dimensionality-reduced dataset.
# Run prediction with the PCA-enabled AMP learner; the predict_results
# attribute is written to CSV further down.
amp_learner_pca.predict()
Save results.
# Write each AMP learner's predict_results to a CSV under EXPERIMENT_STORE
# (the to_csv call suggests a pandas object -- TODO confirm).
amp_learner.predict_results.to_csv(f'{EXPERIMENT_STORE}/amp_learner.csv')
amp_learner_pca.predict_results.to_csv(f'{EXPERIMENT_STORE}/amp_learner_pca.csv')
# pick_k() returns the data that is then handed to analyze_clusters. The
# cluster count k=6 is hard-coded here; presumably it was chosen by inspecting
# pick_k's output -- TODO confirm.
X_pca = amp_learner_pca.pick_k()
amp_learner_pca.analyze_clusters(X_pca, k=6)
# One-hot encode the DNA-binding features as a dense array. The encoder is
# fitted on the training split only and then reused, unchanged, on the test
# split.
# Assuming this is scikit-learn's OneHotEncoder: the dense-output keyword was
# renamed from `sparse` to `sparse_output` in 1.2 and `sparse` was removed in
# 1.4, so try the new spelling first and fall back for older installations.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # Older scikit-learn (< 1.2) does not know `sparse_output`.
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
X_train = ohe.fit_transform(dna_data.X_train)
y_train = dna_data.y_train
X_test = ohe.transform(dna_data.X_test)
y_test = dna_data.y_test
# Bare expression: displays the four shapes when run as a notebook cell.
X_train.shape, y_train.shape, X_test.shape, y_test.shape
# Fit a 50-component PCA on the encoded DNA-binding training matrix and keep
# the projected training data for plotting and for the reduced-data baseline.
pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train)
print(f'X_train_pca.shape: {X_train_pca.shape}')
# Report the explained-variance ratio of the first 10 components only.
print(
f"Explained variance ratio of the first 10 principal components:\n{pca.explained_variance_ratio_[:10]}"
)
# Plot helpers are defined elsewhere; presumably they scatter the first two /
# three principal components coloured by label -- TODO confirm.
visualize_2pcs(X_train_pca, y_train)
visualize_3pcs(X_train_pca, y_train)
Evaluation on full data
# Baseline on the full one-hot encoded DNA-binding features. train_predict is
# defined elsewhere; presumably it fits on the train split and reports test
# results -- TODO confirm.
train_predict(X_train, y_train, X_test, y_test)
Evaluation on reduced data
# Project the test split with the PCA already fitted on the training data
# (transform only, no refit), then rerun the baseline on the reduced features.
X_test_pca = pca.transform(X_test)
train_predict(X_train_pca, y_train, X_test_pca, y_test)
# Rebind the split variables to the raw DNA-binding data taken straight from
# the dataset object, replacing the manually encoded arrays used above. The
# Learner instances are created with ohe=True; presumably they encode
# internally -- TODO confirm against the Learner class.
X_train, y_train = dna_data.X_train, dna_data.y_train
X_test, y_test = dna_data.X_test, dna_data.y_test
# Two learners over the same split that differ only in the `pca` flag.
dna_learner = Learner(X_train, y_train, X_test, y_test, ohe=True, pca=False)
dna_learner_pca = Learner(X_train, y_train, X_test, y_test, ohe=True, pca=True)
# train() returns a pair; both values are discarded here.
_, _ = dna_learner.train()
_, _ = dna_learner_pca.train()
# Label spreading is run on the PCA learner only.
dna_learner_pca.run_label_spreading()
On full dataset.
# Run prediction with the non-PCA DNA-binding learner; the predict_results
# attribute is written to CSV further down.
dna_learner.predict()
On the dimensionality-reduced dataset.
# Run prediction with the PCA-enabled DNA-binding learner; the predict_results
# attribute is written to CSV further down.
dna_learner_pca.predict()
Save results.
# Write each DNA-binding learner's predict_results to a CSV under
# EXPERIMENT_STORE (the to_csv call suggests a pandas object -- TODO confirm).
dna_learner.predict_results.to_csv(f'{EXPERIMENT_STORE}/dna_learner.csv')
dna_learner_pca.predict_results.to_csv(f'{EXPERIMENT_STORE}/dna_learner_pca.csv')
# pick_k() returns the data that is then handed to analyze_clusters. The
# cluster count k=7 is hard-coded here; presumably it was chosen by inspecting
# pick_k's output -- TODO confirm.
X_pca = dna_learner_pca.pick_k()
dna_learner_pca.analyze_clusters(X_pca, k=7)