diff --git a/notebooks/spectral_classifier_big/accuracy_graph_v1.png b/notebooks/spectral_classifier_big/accuracy_graph_v1.png deleted file mode 100644 index f8a4df3979eabd5703067bcb92ee451d6c58a24a..0000000000000000000000000000000000000000 Binary files a/notebooks/spectral_classifier_big/accuracy_graph_v1.png and /dev/null differ diff --git a/notebooks/spectral_classifier_big/accuracy_graph_v2.png b/notebooks/spectral_classifier_big/accuracy_graph_v2.png deleted file mode 100644 index e081d540383484039d2517929a176e8d39428dfe..0000000000000000000000000000000000000000 Binary files a/notebooks/spectral_classifier_big/accuracy_graph_v2.png and /dev/null differ diff --git a/notebooks/spectral_classifier_big/download_fits.ipynb b/notebooks/spectral_classifier_big/download_fits.ipynb index 6489a78564d16afb83fa6e27f7bdbc32e852bac7..4719a56ae737e15ebc269526d2bad32f738f09c1 100644 --- a/notebooks/spectral_classifier_big/download_fits.ipynb +++ b/notebooks/spectral_classifier_big/download_fits.ipynb @@ -24,8 +24,8 @@ "########## Input ##########\n", "\n", "targer_directory = 'F:\\data\\spectral_fits_big\\\\'\n", - "\n", "class_names = ['star','galaxy', 'QSO', 'AGN'] \n", + "samples_per_class = 11000\n", "\n", "## Queries für star, galaxy, quasar und AGN\n", "query1 = \"SELECT top 11000 plate, mjd, min(fiberid) as fiberid, class FROM SpecObj WHERE class = 'star' GROUP BY plate, mjd, class ORDER BY plate, mjd, class\"\n", diff --git a/notebooks/spectral_classifier_big/loss_graph_v1.png b/notebooks/spectral_classifier_big/loss_graph_v1.png deleted file mode 100644 index 0af9ab32631701553ce63952bc1faeacfa4a8968..0000000000000000000000000000000000000000 Binary files a/notebooks/spectral_classifier_big/loss_graph_v1.png and /dev/null differ diff --git a/notebooks/spectral_classifier_big/loss_graph_v2.png b/notebooks/spectral_classifier_big/loss_graph_v2.png deleted file mode 100644 index 34d1ef9218ae02b756df163da6b880ff6c39cc4b..0000000000000000000000000000000000000000 Binary files a/notebooks/spectral_classifier_big/loss_graph_v2.png and /dev/null differ diff --git a/notebooks/spectral_classifier_big/spectral_classifier_from_npy.ipynb b/notebooks/spectral_classifier_big/spectral_classifier_from_npy.ipynb index 4995e52f8159208eb9da0d6992da1b02ce548fb4..96242ec60e8c9f206674337017049431d72ad3d2 100644 --- a/notebooks/spectral_classifier_big/spectral_classifier_from_npy.ipynb +++ b/notebooks/spectral_classifier_big/spectral_classifier_from_npy.ipynb @@ -289,55 +289,18 @@ "split_index = int(len(data_shuffled)*0.9)\n", "\n", "# Daten\n", - "data_training = data_shuffled[:split_index]\n", - "data_test = data_shuffled[split_index:]\n", + "data_training = np.asarray(data_shuffled[:split_index])\n", + "data_test = np.asarray(data_shuffled[split_index:])\n", "\n", "# Labels\n", - "labels_training = labels_shuffled[:split_index]\n", - "labels_test = labels_shuffled[split_index:]\n", + "labels_training = np.asarray(labels_shuffled[:split_index])\n", + "labels_test = np.asarray(labels_shuffled[split_index:])\n", "\n", "# Galaxie numbers\n", "numbers_training = numbers_shuffled[:split_index]\n", "numbers_test = numbers_shuffled[split_index:]" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(3600, 3522)\n", - "(400, 3522)\n", - "(3600,)\n", - "(400,)\n" - ] - } - ], - "source": [ - "data_training = np.asarray(data_training)\n", - "data_test = np.asarray(data_test)\n", - "\n", - "labels_training = np.asarray(labels_training)\n", - "labels_test = np.asarray(labels_test)\n", - "\n", - "print(data_training.shape)\n", - "print(data_test.shape)\n", - "\n", - "print(labels_training.shape)\n", - "print(labels_test.shape)" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/scripts/create_modell.py b/scripts/create_modell.py deleted file mode 100644 index f60e07cae0bf98bb7a32544582cb8e9670aa8a87..0000000000000000000000000000000000000000 --- a/scripts/create_modell.py +++ /dev/null @@ -1,6 +0,0 @@ -import tensorflow as tf -import matplotlib.pyplot as plt -import numpy as np - -print("test") - diff --git a/scripts/conv_network.py b/scripts/digit_recognizer/mnist_conv_example.py similarity index 100% rename from scripts/conv_network.py rename to scripts/digit_recognizer/mnist_conv_example.py diff --git a/scripts/run_example.py b/scripts/digit_recognizer/mnist_example.py similarity index 100% rename from scripts/run_example.py rename to scripts/digit_recognizer/mnist_example.py diff --git a/scripts/main.py b/scripts/main.py deleted file mode 100644 index 79b97d838bfc48dccdad27ce360b630cbb0dc88b..0000000000000000000000000000000000000000 --- a/scripts/main.py +++ /dev/null @@ -1,45 +0,0 @@ -### Importieren aller benötigten Module ### - -import tensorflow as tf -import matplotlib.pyplot as plt -import numpy as np - -#Für die bessere Lesbarkeit -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Activation, Dense, Flatten, Conv2D, MaxPool2D -from tensorflow.keras.metrics import Accuracy - -### Definition aller Funktionen ### - - -def load_data_mnist(testdata=False): - """Lädt den MNIST Trainings- oder Testdatensatz. - - Args: - testdata (boolean): Ob Testdaten geladen werden sollen. - - Returns: - Numpy array: (samples,labels) - train_samples.shape == (60000,28,28) - train_labels.shape == (60000,) - test_samples.shape == (10000,28,28) - test_labels.shape == (10000,) - """ - mnist = tf.keras.datasets.mnist - if testdata==ArithmeticErrorFalse: - (train_samples, train_labels) = mnist.load_data()[0] - print("MNIST Trainingsdatensatz geladen.") - return (train_samples, train_labels) - else: - (test_samples, test_labels) = mnist.load_data()[1] - print("MNIST Testdatensatz geladen.") - return (test_samples, test_labels) - -def create_modell(): - return - -def train_modell(): - return - -def test_modell(): - return diff --git a/scripts/spectral_classifier/__pycache__/benchmark.cpython-38.pyc b/scripts/spectral_classifier/__pycache__/benchmark.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e255fa1d8e386364b80479fd5eaf07f02703bffc Binary files /dev/null and b/scripts/spectral_classifier/__pycache__/benchmark.cpython-38.pyc differ diff --git a/scripts/spectral_classifier/__pycache__/plots.cpython-38.pyc b/scripts/spectral_classifier/__pycache__/plots.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfe132a6140c8544f8f4fd54daa9eec8e6caae44 Binary files /dev/null and b/scripts/spectral_classifier/__pycache__/plots.cpython-38.pyc differ diff --git a/scripts/spectral_classifier/__pycache__/rechnen.cpython-38.pyc b/scripts/spectral_classifier/__pycache__/rechnen.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8107308ee62f4b2614a53c608948b3f9c089fdc Binary files /dev/null and b/scripts/spectral_classifier/__pycache__/rechnen.cpython-38.pyc differ diff --git a/scripts/spectral_classifier/benchmark.py b/scripts/spectral_classifier/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..1dc005d2994ce4c71677ae7903f052466d3cc691 --- /dev/null +++ b/scripts/spectral_classifier/benchmark.py @@ -0,0 +1,38 @@ +from sklearn import svm +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.naive_bayes import GaussianNB + +def benchmark_svc(x_train, y_train, x_test, y_test, gamma=0.001, C=100.): + clf = svm.SVC(gamma=0.001, C=100.) + clf.fit(x_train, y_train) + result = clf.score(x_test, y_test) + print("SVC: " + str(result)) + return result + +def benchmark_LogisticRegression(x_train, y_train, x_test, y_test, max_iter=1000, random_state=123): + clf = LogisticRegression(max_iter=1000, random_state=123) + clf.fit(x_train, y_train) + result = clf.score(x_test, y_test) + print("LogisticRegression: " + str(result)) + return result + +def benchmark_RandomForestClassifier(x_train, y_train, x_test, y_test, n_estimators=100, random_state=123): + clf = RandomForestClassifier(n_estimators=100, random_state=123) + clf.fit(x_train, y_train) + result = clf.score(x_test, y_test) + print("RandomForestClassifier: " + str(result)) + return result + +def benchmark_GaussianNB(x_train, y_train, x_test, y_test): + clf = GaussianNB() + clf.fit(x_train, y_train) + result = clf.score(x_test, y_test) + print("GaussianNB: " + str(result)) + return result + +def benchmark_all(x_train, y_train, x_test, y_test, gamma=0.001, C=100., max_iter=1000, random_state=123, n_estimators=100): + benchmark_svc(x_train, y_train, x_test, y_test, gamma, C) + benchmark_LogisticRegression(x_train, y_train, x_test, y_test, max_iter, random_state) + benchmark_RandomForestClassifier(x_train, y_train, x_test, y_test, n_estimators, random_state) + benchmark_GaussianNB(x_train, y_train, x_test, y_test) \ No newline at end of file diff --git a/scripts/download_sdss_fits.py b/scripts/spectral_classifier/download_fits_from_sdss.py similarity index 55% rename from scripts/download_sdss_fits.py rename to scripts/spectral_classifier/download_fits_from_sdss.py index 9dcaac06eb3960a4b11321b09bb6fbaac6fa6359..8b7432daee73a317427418fd63e541f299faa497 100644 --- a/scripts/download_sdss_fits.py +++ b/scripts/spectral_classifier/download_fits_from_sdss.py @@ -13,13 +13,19 @@ from astropy.coordinates import ICRS import astropy.units as u import requests +########## Input ########## + +targer_directory = 'F:\data\spectral_fits\\' class_names = ['star','galaxy', 'QSO', 'AGN'] +samples_per_class = 1000 + ## Queries für star, galaxy, quasar und AGN +query1 = "SELECT top" + str(samples_per_class) + "plate, mjd, min(fiberid) as fiberid, class FROM SpecObj WHERE class = 'star' GROUP BY plate, mjd, class ORDER BY plate, mjd, class" +query2 = "SELECT top " + str(samples_per_class) + " plate, mjd, min(fiberid) as fiberid, class FROM SpecObj WHERE class = 'galaxy' AND subClass != 'AGN' GROUP BY plate, mjd, class ORDER BY plate, mjd, class" +query3 = "SELECT top " + str(samples_per_class) + " plate, mjd, min(fiberid) as fiberid, class FROM SpecObj WHERE class = 'QSO' AND subClass != 'AGN' GROUP BY plate, mjd, class ORDER BY plate, mjd, class" +query4 = "SELECT top " + str(samples_per_class) + " plate, mjd, min(fiberid) as fiberid, class FROM SpecObj WHERE subClass = 'AGN' GROUP BY plate, mjd, class ORDER BY plate, mjd, class" -query1 = "SELECT top 1000 plate, mjd, min(fiberid) as fiberid, class FROM SpecObj WHERE class = 'star' GROUP BY plate, mjd, class ORDER BY plate, mjd, class" -query2 = "SELECT top 1000 plate, mjd, min(fiberid) as fiberid, class FROM SpecObj WHERE class = 'galaxy' AND subClass != 'AGN' GROUP BY plate, mjd, class ORDER BY plate, mjd, class" -query3 = "SELECT top 1000 plate, mjd, min(fiberid) as fiberid, class FROM SpecObj WHERE class = 'QSO' AND subClass != 'AGN' GROUP BY plate, mjd, class ORDER BY plate, mjd, class" -query4 = "SELECT top 1000 plate, mjd, min(fiberid) as fiberid, class FROM SpecObj WHERE subClass = 'AGN' GROUP BY plate, mjd, class ORDER BY plate, mjd, class" +########## Program ########## queries = [query1, query2, query3, query4] @@ -45,7 +51,7 @@ for i in range(4): url = sdss_path + name r = requests.get(url) - target_file = 'F:\data\spectral_fits\\' + class_names[i] + '\\' + name[5:] + target_file = targer_directory + class_names[i] + '\\' + name[5:] with open(target_file,'wb') as f: diff --git a/scripts/spectral_classifier/fits_to_npy.py b/scripts/spectral_classifier/fits_to_npy.py new file mode 100644 index 0000000000000000000000000000000000000000..e91fc04d194095d7940940bb8a0b9863839c5d16 --- /dev/null +++ b/scripts/spectral_classifier/fits_to_npy.py @@ -0,0 +1,76 @@ +import numpy as np +from astropy.io import fits +import matplotlib.pyplot as plt +from astropy.wcs import WCS +import os +import csv + +########## Input ########## + +fits_path = 'F:\\data\\spectral_fits\\' +data_path = 'F:\\data\\' + +samples_per_class = 1000 + +smallest_wavelength = 4000 # in Angström +biggest_wavelength = 9000 + +########## Program ########## + +all_flux = [] + +for directory in os.listdir(fits_path): + + count_failed=0 + count_added=0 + + for filename in os.listdir(fits_path + directory + '\\'): + + path = fits_path + '\\' + directory + '\\' + filename + + # fits-Dateien öffnen und wavelength + flux einlesen + hdul = fits.open(path) + data = hdul[1].data + flux = data['flux'] + wavelength = 10**data['loglam'] + hdul.close() + + # first und last Index finden + for i in range(len(wavelength)): + if wavelength[i]>smallest_wavelength: + first_index = i + break + + for i in range(len(wavelength)): + if wavelength[i]>biggest_wavelength: + last_index = i + break + + # wavenlength und flux Listen schneiden + wavelength = wavelength[first_index:last_index] + flux = flux[first_index:last_index] + + if len(wavelength) == 3522 and count_added < samples_per_class: + all_flux.append(flux) + count_added += 1 + + if len(wavelength) != 3522: + print("Länge der Liste wavelength ist: " + str(len(wavelength))) + count_failed += 1 + + print(str(count_failed/1000*100) + "% waren nicht erfolgreich bei der Klasse:" + directory) + +# Numpy Arrays mit Daten füllen +data = np.array(all_flux) + +labels = np.zeros(shape=(4000,), dtype='int') +for i in range(4): + for t in range(samples_per_class): + labels[i*1000+t] = i + +wavelengths = np.array(wavelength) + +# Numpy arrays in .npy Dateien speichern +np.save(data_path + "data.npy", data) +np.save(data_path + "labels.npy", labels) +np.save(data_path + "wavelengths.npy", wavelengths) \ No newline at end of file diff --git a/scripts/spectral_classifier/plots.py b/scripts/spectral_classifier/plots.py new file mode 100644 index 0000000000000000000000000000000000000000..3d3de0e2e1e95b81901f5f675a7b695baf4023a6 --- /dev/null +++ b/scripts/spectral_classifier/plots.py @@ -0,0 +1,23 @@ +import matplotlib.pyplot as plt + +def plot_accuracy(name, history, ylim=[0,1.01]): + plt.plot(history.history['accuracy']) + plt.plot(history.history['val_accuracy']) + plt.title('model accuracy') + plt.ylabel('accuracy') + plt.xlabel('epoch') + plt.ylim(ylim[0], ylim[1]) + plt.legend(['training', 'validation'], loc='upper left') + plt.savefig(name) + plt.show() + +def plot_loss(name, history, ylim=[-0.5,2]): + plt.plot(history.history['loss']) + plt.plot(history.history['val_loss']) + plt.title('model loss') + plt.ylabel('loss') + plt.xlabel('epoch') + plt.ylim(ylim[0],ylim[1]) + plt.legend(['training', 'validation'], loc='upper left') + plt.savefig(name) + plt.show() \ No newline at end of file diff --git a/scripts/spectral_classifier/spectral_classifier.py b/scripts/spectral_classifier/spectral_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..ca7d348b819aac7bc3cc3ec83daf2d849fcdf6c4 --- /dev/null +++ b/scripts/spectral_classifier/spectral_classifier.py @@ -0,0 +1,89 @@ +import matplotlib.pyplot as plt +from astropy.io import fits +import numpy as np +import os +import csv +import tensorflow as tf +import random + +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Activation, Dense, Flatten, Conv1D, MaxPooling1D, Dropout, InputLayer, GlobalAveragePooling1D +from tensorflow.keras.metrics import Accuracy + +from tensorflow import keras +from tensorflow.keras import layers + +import plots +import benchmark + +########## Input ########## + +data_path = 'F:\\data\\' +samples_per_class = 1000 + +########## Program ########## + +# Listen mit den flux Werten, Labels und Wellenlängen erstellen +data = np.load(data_path + "data.npy") +labels = np.load(data_path + "labels.npy") +wavelengths = np.load(data_path + "wavelengths.npy") + +# Liste die Galaxie-Nummer speichert +numbers = range(4*samples_per_class) + +# Datensatz mischen +z = list(zip(data, labels, numbers)) +random.shuffle(z) +data_shuffled, labels_shuffled, numbers_shuffled = zip(*z) + +split_index = int(len(data_shuffled)*0.9) + +# Trainings- und Testdatensatz erstellen +data_training = np.asarray(data_shuffled[:split_index]) +data_test = np.asarray(data_shuffled[split_index:]) + +labels_training = np.asarray(labels_shuffled[:split_index]) +labels_test = np.asarray(labels_shuffled[split_index:]) + +numbers_training = numbers_shuffled[:split_index] +numbers_test = numbers_shuffled[split_index:] + +# Daten in Form für Convolutional Network bringen +input_shape = (3522,1) +data_training_r = np.reshape(data_training, newshape=(len(data_training), input_shape[0], input_shape[1])) +data_test_r = np.reshape(data_test, newshape=(len(data_test), input_shape[0], input_shape[1])) + +# Netzwerk erstellen +model = Sequential([ + Conv1D(filters=64, kernel_size=80, strides=10, activation='relu', input_shape=(3522,1)), # stride + MaxPooling1D(3), #Pooling verringert Accuracy leicht aber verhindert overfitting + Dropout(0.35), + Conv1D(filters=128, kernel_size=40, strides=10, activation='relu'), + MaxPooling1D(3), + Dropout(0.35), + Flatten(), + Dense(units=128, activation='relu'), # Droput, weniger neuronen + Dropout(0.35), + Dense(units=4, activation='softmax') +]) + +model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + +x_train = data_training_r +x_test = data_test_r + +y_train = labels_training +y_test = labels_test + +history = model.fit(x_train, y_train, + epochs=75, validation_split=0.1, + shuffle=True, batch_size=200, + verbose=1) + +# Auswertung +plots.plot_accuracy("accuracy_graph_v2.png", history) +plots.plot_loss("loss_graph_v2.png", history) + +# Benchmark +benchmark.benchmark_all(data_training, labels_training, data_test, labels_test) \ No newline at end of file diff --git a/scripts/test_modell.py b/scripts/test_modell.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/scripts/train_modell.py b/scripts/train_modell.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000