Commit 7fdb213b authored by Maximilian Dolling
Browse files

fixes in readme, changed dependencies handling

parent 1e73e5a1
Pipeline #11928 passed with stage in 40 seconds
...@@ -23,7 +23,7 @@ pip install -r requirements.txt ...@@ -23,7 +23,7 @@ pip install -r requirements.txt
You can run the script as follows: You can run the script as follows:
```bash ```bash
python src/astronaut-analysis.py python src/astronaut_analysis.py
``` ```
The script processes the [astronauts data set]( data/astronauts.json) and stores the plots in the directory `results`. The script processes the [astronauts data set]( data/astronauts.json) and stores the plots in the directory `results`.
......
pandas==1.0.5 pandas == 1.0.5
matplotlib==3.2.2 matplotlib == 3.2.2
\ No newline at end of file \ No newline at end of file
...@@ -7,10 +7,10 @@ from datetime import date ...@@ -7,10 +7,10 @@ from datetime import date
from os import makedirs from os import makedirs
from pathlib import Path from pathlib import Path
import matplotlib.pyplot as plt import matplotlib.pyplot
import pandas as pd import pandas
plt.style.use("ggplot") matplotlib.pyplot.style.use("ggplot")
_ASTRONAUT_DATA = "data/astronauts.json" _ASTRONAUT_DATA = "data/astronauts.json"
_OUTPUT_PATH = "results" _OUTPUT_PATH = "results"
...@@ -18,7 +18,7 @@ _OUTPUT_PATH = "results" ...@@ -18,7 +18,7 @@ _OUTPUT_PATH = "results"
## ##
# Data preparation functions # Data preparation functions
## ##
def prepare_data_set(data_frame: pd.DataFrame) -> pd.DataFrame: def prepare_data_set(data_frame: pandas.DataFrame) -> pandas.DataFrame:
""" """
Prepares the raw data by: Prepares the raw data by:
- dropping NaN's - dropping NaN's
...@@ -37,9 +37,9 @@ def prepare_data_set(data_frame: pd.DataFrame) -> pd.DataFrame: ...@@ -37,9 +37,9 @@ def prepare_data_set(data_frame: pd.DataFrame) -> pd.DataFrame:
# Set pandas dtypes for columns with date or time # Set pandas dtypes for columns with date or time
data_frame = data_frame.dropna(subset=["time_in_space"]) data_frame = data_frame.dropna(subset=["time_in_space"])
data_frame["time_in_space"] = data_frame["time_in_space"].astype(int) data_frame["time_in_space"] = data_frame["time_in_space"].astype(int)
data_frame["time_in_space"] = pd.to_timedelta(data_frame["time_in_space"], unit="m") data_frame["time_in_space"] = pandas.to_timedelta(data_frame["time_in_space"], unit="m")
data_frame["birthdate"] = pd.to_datetime(data_frame["birthdate"]) data_frame["birthdate"] = pandas.to_datetime(data_frame["birthdate"])
data_frame["date_of_death"] = pd.to_datetime(data_frame["date_of_death"]) data_frame["date_of_death"] = pandas.to_datetime(data_frame["date_of_death"])
data_frame.sort_values("birthdate", inplace=True) data_frame.sort_values("birthdate", inplace=True)
# Calculate extra columns from the original data # Calculate extra columns from the original data
...@@ -76,12 +76,12 @@ def is_alive(date_of_death) -> bool: ...@@ -76,12 +76,12 @@ def is_alive(date_of_death) -> bool:
Returns: Returns:
bool bool
""" """
if pd.isnull(date_of_death): if pandas.isnull(date_of_death):
return True return True
return False return False
def calculate_age(born: pd.Timestamp) -> int: def calculate_age(born: pandas.Timestamp) -> int:
""" """
Calculates an age from a date. Calculates an age from a date.
...@@ -92,14 +92,14 @@ def calculate_age(born: pd.Timestamp) -> int: ...@@ -92,14 +92,14 @@ def calculate_age(born: pd.Timestamp) -> int:
int int
""" """
if not isinstance(born, pd.Timestamp): if not isinstance(born, pandas.Timestamp):
raise TypeError(f'expected {pd.Timestamp}, got {type(born)}') raise TypeError(f'expected {pandas.Timestamp}, got {type(born)}')
today = date.today() today = date.today()
return today.year - born.year - ((today.month, today.day) < (born.month, born.day)) return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
def died_with_age(row: pd.Series): def died_with_age(row: pandas.Series):
""" """
Calculates an age from a birthdate and date_of_death. Calculates an age from a birthdate and date_of_death.
...@@ -109,7 +109,7 @@ def died_with_age(row: pd.Series): ...@@ -109,7 +109,7 @@ def died_with_age(row: pd.Series):
Returns: Returns:
int int
""" """
if pd.isnull(row["date_of_death"]): if pandas.isnull(row["date_of_death"]):
return None return None
born = row["birthdate"] born = row["birthdate"]
today = row["date_of_death"] today = row["date_of_death"]
...@@ -141,7 +141,7 @@ def create_age_histogram(age_data_frame, died_data_frame): ...@@ -141,7 +141,7 @@ def create_age_histogram(age_data_frame, died_data_frame):
in the categories 'age at dead' and 'age alive'. in the categories 'age at dead' and 'age alive'.
""" """
fig, axs = plt.subplots(1, 1) fig, axs = matplotlib.pyplot.subplots(1, 1)
axs.hist( axs.hist(
[died_data_frame["died_with_age"], age_data_frame["age"]], [died_data_frame["died_with_age"], age_data_frame["age"]],
bins=70, bins=70,
...@@ -160,16 +160,16 @@ def create_age_boxplot(age_data_frame, died_data_frame): ...@@ -160,16 +160,16 @@ def create_age_boxplot(age_data_frame, died_data_frame):
in the categories dead and alive. in the categories dead and alive.
""" """
fig, axs = plt.subplots(1, 1) fig, axs = matplotlib.pyplot.subplots(1, 1)
axs.boxplot([died_data_frame["died_with_age"], age_data_frame["age"]]) axs.boxplot([died_data_frame["died_with_age"], age_data_frame["age"]])
axs.set_title("Age distribution; Dead vs. Alive astronauts") axs.set_title("Age distribution; Dead vs. Alive astronauts")
axs.set_xlabel("Category") axs.set_xlabel("Category")
plt.setp(axs, xticks=[1, 2], xticklabels=["Dead", "Alive"]) matplotlib.pyplot.setp(axs, xticks=[1, 2], xticklabels=["Dead", "Alive"])
axs.set_ylabel("Age") axs.set_ylabel("Age")
save(fig, "boxplot.png") save(fig, "boxplot.png")
def save(fig: plt.Figure, filename: str): def save(fig: matplotlib.pyplot.Figure, filename: str):
""" """
Saves a matplotlib Figure to a file. It overwrites existing files with the same filename. Saves a matplotlib Figure to a file. It overwrites existing files with the same filename.
...@@ -185,7 +185,7 @@ def perform_analysis(): ...@@ -185,7 +185,7 @@ def perform_analysis():
# Set up directory structure and preprocess data # Set up directory structure and preprocess data
makedirs(_OUTPUT_PATH, exist_ok=True) makedirs(_OUTPUT_PATH, exist_ok=True)
data_frame = pd.read_json(Path(_ASTRONAUT_DATA).resolve()) data_frame = pandas.read_json(Path(_ASTRONAUT_DATA).resolve())
data_frame = prepare_data_set(data_frame) data_frame = prepare_data_set(data_frame)
# Male humans in space # Male humans in space
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment