Derek Banks (dmb3ey)
Camille Leonard (cvl7qu)
Gary Mitchell (gm3gq)
For this project, we sourced the Kepler Exoplanet Search Results collected by the Kepler Space Observatory from Kaggle. We created three tables: Exoplanet_info, Transit_Properties, and Stellar_Parameters in our database. The ER diagram and corresponding schema can be found below.
Exoplanet_info(kepoi_name, koi_score, kepid, kepler_name, koi_disposition)
Transit_Properties(kepoi_name, koi_impact, kepid, koi_insol, koi_duration, koi_period, koi_time0bk, koi_depth, koi_prad, koi_teq)
Stellar_Parameters(kepoi_name, kepid, koi_steff, koi_slogg, koi_srad)
has(Exoplanet.kepoi_name, Transit_properties.kepoi_name, Stellar_Parameters.kepoi_name)
import sqlite3
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os.path
#Remove Kepler Datbase if it exists.
if os.path.exists("Kepler.db"):
print("Database Exists... Deleting Kepler.db")
os.remove("Kepler.db")
Database Exists... Deleting Kepler.db
conn = sqlite3.connect('Kepler.db')
cursor = conn.cursor()
sql = '''
CREATE TABLE "Exoplanet_info" (
"kepid" NUMERIC,
"kepoi_name" TEXT,
"kepler_name" TEXT,
"koi_disposition" TEXT,
"koi_score" REAL,
PRIMARY KEY("kepoi_name")
);
'''
# Use the cursor to execute the statement
cursor.execute(sql)
# Use the cursor to close the connection to the database, now that we're done.
cursor.close()
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
# Output to screen so user knows what is going on
print("Inputting the following information into the database: ")
# Open a file for reading using csv.reader [See above for file contents]
with open('cumulative.csv', 'r') as dataFile:
reader = csv.reader(dataFile)
for kepData in reader: # for each row in reader...
# Find the length (i.e. how many elements in the list) (should be 3)
numElements = len(kepData)
#print(kepData)
kepid = kepData[1]
kepoi_name = kepData[2]
kepler_name = kepData[3]
koi_disposition = kepData[4]
koi_score = kepData[6]
sql = '''insert into Exoplanet_info
(kepid, kepoi_name, kepler_name, koi_disposition, koi_score)
values
(:kepid_ph, :kep_name_ph, :kepler_name_ph, :koi_disposition_ph, :koi_score_ph)'''
# These values are "named parameters" (like place holders)
# Tells the SQLite library that something will be substitued here
# Use the cursor to execute the statement
# Here, a dictionary has been added of the named parameters and the items
# to be inserted.
cursor.execute(sql, {'kepid_ph':kepid,
'kep_name_ph':kepoi_name,
'kepler_name_ph':kepler_name,
'koi_disposition_ph':koi_disposition,
'koi_score_ph':koi_score})
# Commit. Telling SQLite to save the new data. The data would be lost otherwise.
conn.commit()
# Use the cursor to close the connection to the database, now that we're done.
cursor.close()
Inputting the following information into the database:
conn = sqlite3.connect('Kepler.db')
cursor = conn.cursor()
sql = '''
CREATE TABLE "Transit_Properties" (
"kepid" NUMERIC,
"kepoi_name" TEXT,
"koi_period" REAL,
"koi_time0bk" REAL,
"koi_impact" REAL,
"koi_duration" REAL,
"koi_depth" REAL,
"koi_prad" REAL,
"koi_teq" REAL,
"koi_insol" REAL,
PRIMARY KEY("kepoi_name")
);
'''
# Use the cursor to execute the statement
cursor.execute(sql)
# Use the cursor to close the connection to the database, now that we're done.
cursor.close()
sqlite3.paramstyle = 'named'
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
# Output to screen so user knows what is going on
#print("Inputting the following information into the database: ")
# Open a file for reading using csv.reader [See above for file contents]
with open('cumulative.csv', 'r') as dataFile:
reader = csv.reader(dataFile)
for kepData in reader: # for each row in reader...
# Find the length (i.e. how many elements in the list) (should be 3)
numElements = len(kepData)
#print(kepData)
kepid = kepData[1]
kepoi_name = kepData[2]
koi_period = kepData[11]
koi_time0bk = kepData[14]
koi_impact = kepData[17]
koi_duration = kepData[20]
koi_depth = kepData[23]
koi_prad = kepData[26]
koi_teq = kepData[29]
koi_insol = kepData[32]
sql = '''insert into Transit_Properties
(kepid,
kepoi_name,
koi_period,
koi_time0bk,
koi_impact,
koi_duration,
koi_depth,
koi_prad,
koi_teq,
koi_insol)
values
(:kepid_ph,
:kep_name_ph,
:koi_period_ph,
:koi_time0bk_ph,
:koi_impact_ph,
:koi_duration_ph,
:koi_depth_ph,
:koi_prad_ph,
:koi_teq_ph,
:koi_insol_ph)'''
# These values are "named parameters" (like place holders)
# Tells the SQLite library that something will be substitued here
# Use the cursor to execute the statement
# Here, a dictionary has been added of the named parameters and the items
# to be inserted.
cursor.execute(sql, {'kepid_ph':kepid,
'kep_name_ph':kepoi_name,
'koi_period_ph':koi_period,
'koi_time0bk_ph':koi_time0bk,
'koi_impact_ph':koi_impact,
'koi_duration_ph':koi_duration,
'koi_depth_ph':koi_depth,
'koi_prad_ph':koi_prad,
'koi_teq_ph':koi_teq,
'koi_insol_ph':koi_insol})
# Commit. Telling SQLite to save the new data. The data would be lost otherwise.
conn.commit()
# Use the cursor to close the connection to the database, now that we're done.
cursor.close()
conn = sqlite3.connect('Kepler.db')
cursor = conn.cursor()
sql = '''
CREATE TABLE "Stellar_Parameters" (
"kepid" NUMERIC,
"kepoi_name" TEXT,
"koi_steff" REAL,
"koi_slogg" REAL,
"koi_srad" REAL,
PRIMARY KEY("kepoi_name")
);
'''
# Use the cursor to execute the statement
cursor.execute(sql)
# Use the cursor to close the connection to the database, now that we're done.
cursor.close()
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
# Output to screen so user knows what is going on
#print("Inputting the following information into the database: ")
# Open a file for reading using csv.reader [See above for file contents]
with open('cumulative.csv', 'r') as dataFile:
reader = csv.reader(dataFile)
for kepData in reader: # for each row in reader...
# Find the length (i.e. how many elements in the list) (should be 3)
numElements = len(kepData)
#print(kepData)
kepid = kepData[1]
kepoi_name = kepData[2]
koi_steff = kepData[38]
koi_slogg = kepData[41]
koi_srad = kepData[44]
sql = '''insert into Stellar_Parameters
(kepid,
kepoi_name,
koi_steff,
koi_slogg,
koi_srad)
values
(:kepid_ph,
:kep_name_ph,
:koi_steff_ph,
:koi_slogg_ph,
:koi_srad_ph)'''
# These values are "named parameters" (like place holders)
# Tells the SQLite library that something will be substitued here
# Use the cursor to execute the statement
# Here, a dictionary has been added of the named parameters and the items
# to be inserted.
cursor.execute(sql, {'kepid_ph':kepid,
'kep_name_ph':kepoi_name,
'koi_steff_ph':koi_steff,
'koi_slogg_ph':koi_slogg,
'koi_srad_ph':koi_srad})
# Commit. Telling SQLite to save the new data. The data would be lost otherwise.
conn.commit()
# Use the cursor to close the connection to the database, now that we're done.
cursor.close()
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
cursor.execute('''SELECT kepoi_name,koi_score FROM Exoplanet_Info WHERE koi_disposition='CONFIRMED'
''')
query = cursor.fetchall()
query_1 = pd.DataFrame(query,columns=['name','score'])
query_1['score'] = pd.to_numeric(query_1['score'])
query_1
name | score | |
---|---|---|
0 | K00752.01 | 1.000 |
1 | K00752.02 | 0.969 |
2 | K00755.01 | 1.000 |
3 | K00756.01 | 1.000 |
4 | K00756.02 | 1.000 |
... | ... | ... |
2288 | K03014.01 | 0.941 |
2289 | K03106.01 | 0.877 |
2290 | K00285.03 | 0.476 |
2291 | K03417.01 | 1.000 |
2292 | K04385.02 | 0.911 |
2293 rows × 2 columns
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
cursor.execute('''SELECT kepoi_name, koi_disposition, koi_score FROM Exoplanet_Info WHERE koi_score > 0.95 AND kepoi_name IS NOT 'kepoi_name'
''')
query = cursor.fetchall()
query_2 = pd.DataFrame(query,columns=['name','disposition','score'])
query_2['score'] = pd.to_numeric(query_2['score'])
query_2
name | disposition | score | |
---|---|---|---|
0 | K00752.01 | CONFIRMED | 1.000 |
1 | K00752.02 | CONFIRMED | 0.969 |
2 | K00755.01 | CONFIRMED | 1.000 |
3 | K00756.01 | CONFIRMED | 1.000 |
4 | K00756.02 | CONFIRMED | 1.000 |
... | ... | ... | ... |
4727 | K07870.01 | CANDIDATE | 0.982 |
4728 | K04157.02 | CANDIDATE | 0.996 |
4729 | K04260.04 | CANDIDATE | 0.971 |
4730 | K03875.01 | CANDIDATE | 1.000 |
4731 | K03891.01 | CANDIDATE | 1.000 |
4732 rows × 3 columns
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
cursor.execute('''SELECT kepoi_name,koi_score FROM Exoplanet_Info WHERE koi_score > 0.95 AND koi_disposition = 'CONFIRMED'
''')
query = cursor.fetchall()
query_3 = pd.DataFrame(query,columns=['name','score'])
query_3
name | score | |
---|---|---|
0 | K00752.01 | 1 |
1 | K00752.02 | 0.969 |
2 | K00755.01 | 1 |
3 | K00756.01 | 1 |
4 | K00756.02 | 1 |
... | ... | ... |
2037 | K00248.01 | 1 |
2038 | K00107.01 | 1 |
2039 | K02926.02 | 0.976 |
2040 | K02956.01 | 1 |
2041 | K03417.01 | 1 |
2042 rows × 2 columns
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
cursor.execute('''SELECT koi_disposition,COUNT(koi_score) AS tally FROM Exoplanet_Info WHERE koi_disposition IS NOT 'koi_disposition' GROUP BY koi_disposition
''')
query = cursor.fetchall()
query_4 = pd.DataFrame(query,columns=['disposition','tally'])
query_4
disposition | tally | |
---|---|---|
0 | CANDIDATE | 2248 |
1 | CONFIRMED | 2293 |
2 | FALSE POSITIVE | 5023 |
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
cursor.execute('''SELECT Exoplanet_Info.kepoi_name, Stellar_Parameters.koi_steff
FROM Exoplanet_Info, Stellar_Parameters
WHERE Exoplanet_Info.kepoi_name = Stellar_Parameters.kepoi_name
AND Exoplanet_Info.koi_disposition = 'CONFIRMED'
''')
query = cursor.fetchall()
query_5 = pd.DataFrame(query,columns=['name','star_temp'])
query_5
name | star_temp | |
---|---|---|
0 | K00752.01 | 5455 |
1 | K00752.02 | 5455 |
2 | K00755.01 | 6031 |
3 | K00756.01 | 6046 |
4 | K00756.02 | 6046 |
... | ... | ... |
2288 | K03014.01 | 6161 |
2289 | K03106.01 | 5866 |
2290 | K00285.03 | 5862 |
2291 | K03417.01 | 5915 |
2292 | K04385.02 | 5119 |
2293 rows × 2 columns
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
cursor.execute('''SELECT Exoplanet_Info.kepoi_name, Transit_properties.koi_teq
FROM Exoplanet_Info, Transit_properties
WHERE Exoplanet_Info.kepoi_name = Transit_properties.kepoi_name
AND Exoplanet_Info.koi_disposition = 'CONFIRMED'
''')
query = cursor.fetchall()
query_6 = pd.DataFrame(query,columns=['name','planet_temp'])
query_6
name | planet_temp | |
---|---|---|
0 | K00752.01 | 793 |
1 | K00752.02 | 443 |
2 | K00755.01 | 1406 |
3 | K00756.01 | 835 |
4 | K00756.02 | 1160 |
... | ... | ... |
2288 | K03014.01 | 441 |
2289 | K03106.01 | 1844 |
2290 | K00285.03 | 637 |
2291 | K03417.01 | 415 |
2292 | K04385.02 | 209 |
2293 rows × 2 columns
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
cursor.execute('''SELECT Exoplanet_Info.koi_disposition,
AVG(Transit_properties.koi_teq),
AVG(Transit_properties.koi_period)
FROM Exoplanet_Info, Transit_properties
WHERE Exoplanet_Info.kepoi_name = Transit_properties.kepoi_name
AND Exoplanet_Info.koi_disposition NOT IN ('koi_disposition')
GROUP BY Exoplanet_Info.koi_disposition
''')
query = cursor.fetchall()
query_7 = pd.DataFrame(query,columns=['Disposition','Average Planet Temp (K)','Average Orbital Period (days)'])
query_7
Disposition | Average Planet Temp (K) | Average Orbital Period (days) | |
---|---|---|---|
0 | CANDIDATE | 857.690391 | 130.523850 |
1 | CONFIRMED | 838.759703 | 27.063211 |
2 | FALSE POSITIVE | 1221.435596 | 73.312227 |
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
cursor.execute('''SELECT Exoplanet_Info.kepoi_name, Exoplanet_Info.koi_disposition,
Transit_properties.koi_teq,
Transit_properties.koi_period
FROM Exoplanet_Info, Transit_properties
WHERE Exoplanet_Info.kepoi_name = Transit_properties.kepoi_name
AND Exoplanet_Info.koi_disposition NOT IN ('koi_disposition')
AND Transit_properties.koi_teq BETWEEN 250 AND 350
AND Transit_properties.koi_period BETWEEN 350 AND 380
''')
query = cursor.fetchall()
query_8 = pd.DataFrame(query,columns=['Name','Disposition','Planet Temp (K)','Orbital Period (days)'])
query_8
Name | Disposition | Planet Temp (K) | Orbital Period (days) | |
---|---|---|---|---|
0 | K02810.02 | FALSE POSITIVE | 303.0 | 354.951180 |
1 | K02342.02 | FALSE POSITIVE | 325.0 | 354.781940 |
2 | K05922.01 | FALSE POSITIVE | 287.0 | 367.074460 |
3 | K00465.01 | FALSE POSITIVE | 283.0 | 363.706662 |
4 | K04065.01 | FALSE POSITIVE | 267.0 | 372.373690 |
... | ... | ... | ... | ... |
92 | K08172.01 | FALSE POSITIVE | 284.0 | 371.518520 |
93 | K08268.01 | FALSE POSITIVE | 260.0 | 368.522400 |
94 | K08293.01 | FALSE POSITIVE | 258.0 | 378.654840 |
95 | K07982.01 | CANDIDATE | 265.0 | 376.379890 |
96 | K04260.03 | FALSE POSITIVE | 304.0 | 358.660440 |
97 rows × 4 columns
# Create a connection to the database.
conn = sqlite3.connect('Kepler.db')
# Create a cursor.
cursor = conn.cursor()
cursor.execute('''SELECT Exoplanet_Info.kepoi_name, Stellar_Parameters.koi_steff, Transit_properties.koi_teq
FROM Exoplanet_Info, Stellar_Parameters, Transit_properties
WHERE Exoplanet_Info.kepoi_name = Stellar_Parameters.kepoi_name
AND Exoplanet_Info.kepoi_name = Transit_properties.kepoi_name
AND Exoplanet_Info.koi_disposition = 'CONFIRMED'
''')
query = cursor.fetchall()
query_9 = pd.DataFrame(query,columns=['name','star_temp', 'planet_temp'])
query_9['star_temp'] = pd.to_numeric(query_9['star_temp'])
query_9['planet_temp'] = pd.to_numeric(query_9['planet_temp'])
query_9 = query_9.dropna()
query_9
name | star_temp | planet_temp | |
---|---|---|---|
0 | K00752.01 | 5455.0 | 793.0 |
1 | K00752.02 | 5455.0 | 443.0 |
2 | K00755.01 | 6031.0 | 1406.0 |
3 | K00756.01 | 6046.0 | 835.0 |
4 | K00756.02 | 6046.0 | 1160.0 |
... | ... | ... | ... |
2288 | K03014.01 | 6161.0 | 441.0 |
2289 | K03106.01 | 5866.0 | 1844.0 |
2290 | K00285.03 | 5862.0 | 637.0 |
2291 | K03417.01 | 5915.0 | 415.0 |
2292 | K04385.02 | 5119.0 | 209.0 |
2292 rows × 3 columns
cursor.close()
# scatterplot confirmed and koi_score query 1
f1, ax = plt.subplots(figsize=(5,6))
sns.set()
sns.set_style('whitegrid')
sns.set_style('ticks', {"xtick.major.size": 8,
"ytick.major.size": 8})
sns.histplot(query_1.score)
plt.title("Score Distribution of Confirmed Planets", weight= "bold", size = 'large')
plt.xlabel('Score')
plt.xlim(0.8, 1)
plt.show()
The plot above shows the distribution of confidence scores among the exoplanets that are within the confirmed disposition. As you can see, a lare majority of the scores are at or near 1, meaning near absolute confidence.
#Violin chart query 2
f3, ax=plt.subplots(figsize=(8,9))
sns.set()
sns.set_style('whitegrid')
plot3 = sns.violinplot(x='disposition', y='score', data=query_2)
plt.title("Distribution of Planet Disposition with Score > 0.95", weight= "bold", size = 'large')
plt.xlabel('Disposition')
plt.ylabel('Score')
plt.show()
The plot above shows a similar distribution, but has been expanded to look at all dispositions of the exoplanets. The confirmed category appears to be more heavily distributed towards the upper end of the specturm, seeing as the plot is wider near one. The Candidate category has a somewhat similar distribution, but doesn't appear to have nearly as many observations with a value at or near one.
#query 7 scatter or bar
f2, ax=plt.subplots(figsize=(8,9))
sns.set()
sns.set_style('whitegrid')
plot2 = sns.barplot(x = 'Disposition', y = 'Average Orbital Period (days)', data = query_7)
plt.title("Average Orbital Periods per Disposition", weight= "bold", size = 'large')
plt.show()
The plot above shows another distribution amongst the 3 disposition classes, but this time looking at the average orbital period, or it other words, how long 1 year is for each class. On average, the confirmed class has an orbital period of close to 30 days, implying that the planets within the confirmed class are very close to their host star. If this is true, we would expect the planet temperature to be quite high. The candidate class has a much higher average orbital period. Perhaps planets close to the local star are easier to identify as being an exoplanet or not, where as celestial bodies further away present challenges to the detection methods used in identifying exoplanets.
#query 8 scatter
temps = np.arange(250, 350, 10)
f4, ax = plt.subplots(figsize=(8,9))
sns.set()
sns.set_style('whitegrid')
plot = sns.stripplot(x = 'Planet Temp (K)', y = 'Orbital Period (days)', data=query_8, hue='Disposition')
for ind, label in enumerate(plot.get_xticklabels()):
if ind % 10 == 0: # every 10th label is kept
label.set_visible(True)
else:
label.set_visible(False)
plt.title("Planets with Temperatures and Orbital Periods Like Earth", weight= "bold", size = 'large')
plt.legend(loc='upper right')
plt.show()
The above plot is looking at a scatter plot distribution of the planets that had both an earth like temperature, and an earth like distance from the sun, as represented by the orbital period of the planet. It's worth noting that of all the points on the plot, only one of them is in the confirmed class, and most of the points are classified as false postives. Finding an earth like planet, even among the Kepler data, truly is like finding a needle in a haystack.
#query 9
f5, ax=plt.subplots(figsize=(8,9))
sns.set()
sns.set_style('whitegrid')
plot5 = plt.hist2d(x='star_temp', y='planet_temp', data=query_9, cmap = 'rocket_r')
plt.colorbar()
plt.title("Star and Planet Temperature of Confirmed Planets", weight= "bold", size = 'large')
plt.xlabel('Star Temp (K)')
plt.ylabel('Planet Temp (K)')
plt.show()
The plot above does a good job visualizing the distribution of star and planet temperatures of the confirmed planet category. As mentioned earlier, a large majority of the confirmed planets had a lower orbital period, and thus a high planet temperature, or so intuition would lead us to believe. The plot above confirms this. A large majority of the planets have temperature well above 300 K which would be roughly 27 degrees celcius. Any more than 300, and we start looking at planets that are nearly uninhabitable for human life. That being said, there do appear to be some planets that may fall within the range required by humans.