nielsr HF staff committed on
Commit
2adbdb9
1 Parent(s): 1396667

More improvements

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. app.py +52 -43
  3. load_dataframe.py +22 -15
.gitignore CHANGED
@@ -1 +1,2 @@
1
- env/
 
 
1
+ env/
2
+ *.pyc
app.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
5
  import numpy as np
6
  import matplotlib.pyplot as plt
7
 
 
8
  from load_dataframe import get_data
9
 
10
 
@@ -48,7 +49,34 @@ def aggregated_data(df, aggregation_level="week"):
48
  st.pyplot(plt)
49
 
50
 
51
- def display_data(df):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
53
  num_artifacts = df['has_artifact'].sum()
54
  percentage_of_at_least_one_artifact = num_artifacts / df.shape[0] if df.shape[0] > 0 else 0
@@ -67,32 +95,13 @@ def display_data(df):
67
  """)
68
 
69
  st.write("Papers with at least one artifact")
70
- st.data_editor(df[df['has_artifact']],
71
- hide_index=True,
72
- column_order=("reached_out", "reached_out_link", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
73
- column_config={"github": st.column_config.LinkColumn(),
74
- "paper_page": st.column_config.LinkColumn(),
75
- "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)')},
76
- width=2000,
77
- key="papers_with_artifacts")
78
-
79
  st.write("Papers without artifacts")
80
- st.data_editor(df[~df['has_artifact']],
81
- hide_index=True,
82
- column_order=("reached_out", "reached_out_link", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
83
- column_config={"github": st.column_config.LinkColumn(),
84
- "paper_page": st.column_config.LinkColumn()},
85
- width=2000,
86
- key="papers_without_artifacts")
87
 
88
  st.write("Papers with a HF mention in README but no artifacts")
89
- st.data_editor(df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
90
- hide_index=True,
91
- column_order=("reached_out", "reached_out_link", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
92
- column_config={"github": st.column_config.LinkColumn(),
93
- "paper_page": st.column_config.LinkColumn()},
94
- width=2000,
95
- key="papers_with_hf_mention_no_artifacts")
96
 
97
 
98
  def main():
@@ -102,36 +111,29 @@ def main():
102
  st.sidebar.title("Navigation")
103
  selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])
104
 
105
- # TODO use this instead
106
- df = get_data()
107
-
108
- print(df.head())
109
-
110
- # df = pd.read_csv('daily_papers_enriched (3).csv')
111
- df = df.drop(['Unnamed: 0'], axis=1) if 'Unnamed: 0' in df.columns else df
112
- # Use date as index
113
- # df = df.set_index('date')
114
- # df.index = pd.to_datetime(df.index)
115
- df = df.sort_index()
116
-
117
  if selection == "Daily/weekly/monthly data":
118
  # Button to select day, month or week
119
  # Add streamlit selectbox.
120
  view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])
121
 
122
  if view_level == "day":
 
 
 
123
  # make a button to select the day, defaulting to today
124
  day = st.date_input("Select day", value="today", format="DD/MM/YYYY")
125
  # convert to the day of a Pandas Timestamp
126
  day = pd.Timestamp(day)
127
 
128
- df = df[df.index.date == day.date()]
129
 
130
  st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")
131
-
132
- display_data(df)
133
 
134
  elif view_level == "week":
 
 
 
135
  # make a button to select the week
136
  week_number = st.number_input("Select week", value=datetime.today().isocalendar()[1], min_value=1, max_value=52)
137
 
@@ -139,13 +141,16 @@ def main():
139
  df['week'] = df.index.isocalendar().week
140
 
141
  # Filter the dataframe for the desired week number
142
- df = df[df['week'] == week_number]
143
 
144
  st.write(f"Showing data for week {week_number}")
145
 
146
- display_data(df)
147
 
148
  elif view_level == "month":
 
 
 
149
  # make a button to select the month, defaulting to current month
150
  month_str = st.selectbox("Select month", options=["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"])
151
  year_str = st.selectbox("Select year", options=["2024"])
@@ -160,13 +165,17 @@ def main():
160
  # Convert month string to number
161
  month = month_map[month_str]
162
  year = int(year_str)
163
- df = df[(df.index.month == month) & (df.index.year == year)]
164
 
165
  st.write(f"Showing data for {month_str} {year_str}")
166
 
167
- display_data(df)
168
 
169
  elif selection == "Aggregated data":
 
 
 
 
170
  aggregated_data(df)
171
  aggregated_data(df, aggregation_level="month")
172
 
 
5
  import numpy as np
6
  import matplotlib.pyplot as plt
7
 
8
+ from datasets import Dataset
9
  from load_dataframe import get_data
10
 
11
 
 
49
  st.pyplot(plt)
50
 
51
 
52
def show_data_editor(df: pd.DataFrame, key: str) -> pd.DataFrame:
    """
    Render `df` as an editable Streamlit table and return the edited result.

    Args:
        df: The papers to display. Only the columns named in `column_order`
            below are shown, in that order; the index is hidden.
        key: Streamlit widget key for this editor. Streamlit requires widget
            keys to be unique within a page, so each editor needs its own.

    Returns:
        The value returned by `st.data_editor`, i.e. the data including any
        edits made by the user. Existing callers may ignore it.
    """
    visible_columns = ("reached_out", "reached_out_link", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces")

    # Render URL columns as clickable links.
    # NOTE(review): "paper_page_with_title" is configured here but is not part
    # of `visible_columns`, so it is presumably never displayed - confirm.
    link_columns = {"github": st.column_config.LinkColumn(),
                    "paper_page": st.column_config.LinkColumn(),
                    "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)')}

    edited_df = st.data_editor(df,
                               hide_index=True,
                               column_order=visible_columns,
                               column_config=link_columns,
                               width=2000,
                               key=key)

    # TODO persist edits. Saving `edited_df` directly whenever it differs from
    # `df` is wrong: callers appear to pass a filtered subset, so pushing it
    # as-is would replace the full dataset with that subset. Rather we should
    # probably do a merge-join (overwriting the edited rows) into the full
    # dataframe and then save the new dataframe.
    return edited_df
68
+
69
+
70
def save_data(df: pd.DataFrame, repo_id: str = "nielsr/daily-papers-enriched") -> None:
    """
    Upload `df` to the Hugging Face Hub as a dataset.

    Args:
        df: The dataframe to upload.
        repo_id: Hub dataset repository to push to. Defaults to the repository
            this app has always written to.
    """
    # convert to a HF dataset so it can be pushed to the Hub
    dataset = Dataset.from_pandas(df)

    dataset.push_to_hub(repo_id)
77
+
78
+
79
+ def display_data(df: pd.DataFrame):
80
  df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
81
  num_artifacts = df['has_artifact'].sum()
82
  percentage_of_at_least_one_artifact = num_artifacts / df.shape[0] if df.shape[0] > 0 else 0
 
95
  """)
96
 
97
  st.write("Papers with at least one artifact")
98
+ show_data_editor(df[df['has_artifact']], key="papers_with_artifacts")
99
+
 
 
 
 
 
 
 
100
  st.write("Papers without artifacts")
101
+ show_data_editor(df[~df['has_artifact']], key="papers_without_artifacts")
 
 
 
 
 
 
102
 
103
  st.write("Papers with a HF mention in README but no artifacts")
104
+ show_data_editor(df[(df['hf_mention'] == 1) & (~df['has_artifact'])], key="papers_with_hf_mention_no_artifacts")
 
 
 
 
 
 
105
 
106
 
107
  def main():
 
111
  st.sidebar.title("Navigation")
112
  selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])
113
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  if selection == "Daily/weekly/monthly data":
115
  # Button to select day, month or week
116
  # Add streamlit selectbox.
117
  view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])
118
 
119
  if view_level == "day":
120
+ # get the latest dataframe
121
+ df = get_data()
122
+
123
  # make a button to select the day, defaulting to today
124
  day = st.date_input("Select day", value="today", format="DD/MM/YYYY")
125
  # convert to the day of a Pandas Timestamp
126
  day = pd.Timestamp(day)
127
 
128
+ filtered_df = df[df.index.date == day.date()]
129
 
130
  st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")
131
+ display_data(df=filtered_df)
 
132
 
133
  elif view_level == "week":
134
+ # get the latest dataframe
135
+ df = get_data()
136
+
137
  # make a button to select the week
138
  week_number = st.number_input("Select week", value=datetime.today().isocalendar()[1], min_value=1, max_value=52)
139
 
 
141
  df['week'] = df.index.isocalendar().week
142
 
143
  # Filter the dataframe for the desired week number
144
+ filtered_df = df[df['week'] == week_number]
145
 
146
  st.write(f"Showing data for week {week_number}")
147
 
148
+ display_data(df=filtered_df)
149
 
150
  elif view_level == "month":
151
+ # get the latest dataframe
152
+ df = get_data()
153
+
154
  # make a button to select the month, defaulting to current month
155
  month_str = st.selectbox("Select month", options=["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"])
156
  year_str = st.selectbox("Select year", options=["2024"])
 
165
  # Convert month string to number
166
  month = month_map[month_str]
167
  year = int(year_str)
168
+ filtered_df = df[(df.index.month == month) & (df.index.year == year)]
169
 
170
  st.write(f"Showing data for {month_str} {year_str}")
171
 
172
+ display_data(df=filtered_df)
173
 
174
  elif selection == "Aggregated data":
175
+
176
+ # get the latest dataframe
177
+ df = get_data()
178
+
179
  aggregated_data(df)
180
  aggregated_data(df, aggregation_level="month")
181
 
load_dataframe.py CHANGED
@@ -20,9 +20,11 @@ class PaperInfo:
20
  num_comments: int
21
 
22
 
23
- def get_df(start_date: str, end_date: str) -> pd.DataFrame:
24
  """
25
  Load the initial dataset as a Pandas dataframe.
 
 
26
  """
27
 
28
  df = pd.merge(
@@ -45,8 +47,9 @@ def get_df(start_date: str, end_date: str) -> pd.DataFrame:
45
  # set date as index
46
  df = df.set_index('date')
47
  df.index = pd.to_datetime(df.index)
48
- # only include data between start_date and end_date
49
- df = df[(df.index >= start_date) & (df.index <= end_date)]
 
50
 
51
  return df
52
 
@@ -150,8 +153,8 @@ def check_hf_mention(batch):
150
  if response.status_code == 200:
151
  # get text
152
  text = response.text
153
- if "huggingface" in text.lower() or "hugging face" in text.lower():
154
- hf_mention = 1
155
 
156
  hf_mentions.append(hf_mention)
157
 
@@ -179,18 +182,14 @@ def process_data(start_date: str, end_date: str) -> pd.DataFrame:
179
  dataset = dataset.map(check_hf_mention, batched=True, batch_size=4, num_proc=cpu_count())
180
 
181
  # return as Pandas dataframe
 
182
  dataframe = dataset.to_pandas()
183
-
184
- # convert date column to datetime
185
- dataframe['date'] = pd.to_datetime(dataframe['date'])
186
-
187
- print("First few rows of the dataset:")
188
- print(dataframe.head())
189
 
190
  return dataframe
191
 
192
 
193
- @st.cache_data
194
  def get_data() -> pd.DataFrame:
195
 
196
  # step 1: load pre-processed data
@@ -200,14 +199,22 @@ def get_data() -> pd.DataFrame:
200
  df.index = pd.to_datetime(df.index)
201
 
202
  # step 2: check how much extra data we need to process
203
- latest_day = df.iloc[-1].name.strftime('%d-%m-%Y')
204
- today = pd.Timestamp.today().strftime('%d-%m-%Y')
 
 
 
205
 
206
  # step 3: process the missing data
207
  if latest_day < today:
208
  print(f"Processing data from {latest_day} to {today}")
209
  new_df = process_data(start_date=latest_day, end_date=today)
210
- new_df = new_df[new_df.index > latest_day]
 
 
 
211
  df = pd.concat([df, new_df])
212
 
 
 
213
  return df
 
20
  num_comments: int
21
 
22
 
23
+ def get_df(start_date: str = None, end_date: str = None) -> pd.DataFrame:
24
  """
25
  Load the initial dataset as a Pandas dataframe.
26
+
27
+ One can optionally specify a start_date and end_date to only include data between these dates.
28
  """
29
 
30
  df = pd.merge(
 
47
  # set date as index
48
  df = df.set_index('date')
49
  df.index = pd.to_datetime(df.index)
50
+ if start_date is not None and end_date is not None:
51
+ # only include data between start_date and end_date
52
+ df = df[(df.index >= start_date) & (df.index <= end_date)]
53
 
54
  return df
55
 
 
153
  if response.status_code == 200:
154
  # get text
155
  text = response.text
156
+ if "huggingface" in text.lower() or "hugging face" in text.lower():
157
+ hf_mention = 1
158
 
159
  hf_mentions.append(hf_mention)
160
 
 
182
  dataset = dataset.map(check_hf_mention, batched=True, batch_size=4, num_proc=cpu_count())
183
 
184
  # return as Pandas dataframe
185
+ # making sure that the date is set as index
186
  dataframe = dataset.to_pandas()
187
+ dataframe = dataframe.set_index('date')
188
+ dataframe.index = pd.to_datetime(dataframe.index)
 
 
 
 
189
 
190
  return dataframe
191
 
192
 
 
193
  def get_data() -> pd.DataFrame:
194
 
195
  # step 1: load pre-processed data
 
199
  df.index = pd.to_datetime(df.index)
200
 
201
  # step 2: check how much extra data we need to process
202
+ latest_day = df.iloc[-1].name.strftime('%Y-%m-%d')
203
+ today = pd.Timestamp.today().strftime('%Y-%m-%d')
204
+
205
+ print("Latest day:", latest_day)
206
+ print("Today:", today)
207
 
208
  # step 3: process the missing data
209
  if latest_day < today:
210
  print(f"Processing data from {latest_day} to {today}")
211
  new_df = process_data(start_date=latest_day, end_date=today)
212
+
213
+ print("Original df:", df.head())
214
+ print("New df:", new_df.head())
215
+
216
  df = pd.concat([df, new_df])
217
 
218
+ df = df.sort_index()
219
+
220
  return df