A way to access metadata on all the files due for processing.
%load_ext autoreload
%autoreload 2
%load_ext rich
In this next section I manage to get some makeshift drift detection working with `evidently`:
# from evidently.dashboard import Dashboard
# from evidently.tabs import DataDriftTab
# from evidently.pipeline.column_mapping import ColumnMapping
# source1 = Path("/Users/strickvl/Desktop/NL")
# source2 = Path("/Users/strickvl/Desktop/machine-learning-flashcards")
# data_1 = get_dataframe_stats(source1)
# data_2 = get_dataframe_stats(source2)
# data_types_dict = {
# 'filename': str,
# "pagecount": np.number,
# 'has_ocr_layer': np.number,
# 'pdf_file_size_bytes': np.number,
# 'author': str,
# }
# data_1['date_created'] = pd.to_datetime(data_1['date_created'])
# data_1['date_last_modified'] = pd.to_datetime(data_1['date_last_modified'])
# data_1 = data_1.astype(data_types_dict)
# data_2['date_created'] = pd.to_datetime(data_2['date_created'])
# data_2['date_last_modified'] = pd.to_datetime(data_2['date_last_modified'])
# data_2 = data_2.astype(data_types_dict)
# cols_to_drop = ['date_created', 'date_last_modified', "filename", "author"]
# data_1.drop(cols_to_drop, axis=1, inplace=True)
# data_2.drop(cols_to_drop, axis=1, inplace=True)
# # export_stats_as_csv(source1, Path("./tryout/stats1.csv"))
# # export_stats_as_csv(source2, Path("./tryout/stats2.csv"))
# # df1 = pd.read_csv("./tryout/stats1.csv")
# # df2 = pd.read_csv("./tryout/stats2.csv")
# data_drift_report = Dashboard(tabs=[DataDriftTab()])
# # these next two lines need fixing
# # data_drift_report.calculate(data_1, data_2)
# # data_drift_report.show()
# Directory to inspect. This is a hard-coded absolute path under /Users/strickvl,
# so point it somewhere else before running this cell on another machine.
source = Path("/Users/strickvl/Desktop/machine-learning-flashcards")
# Pass the directory to get_stats and hand its return value straight to
# display_stats.
# NOTE(review): both helpers are defined outside this section — presumably they
# collect and render per-file metadata for the directory; confirm against their
# definitions.
display_stats(get_stats(source))