seanpedrickcase committed
Commit 90553eb • 1 Parent(s): 5888649

App now retains original index following cleaning to allow for referring back to original data

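The mechanism behind this is small: before cleaning, the app copies the DataFrame's positional index into an `original_index` column (see the `pre_clean` change in funcs/topic_core_funcs.py below), so however many rows later get dropped or split, each output row still names the source row it came from. A minimal sketch of the pattern, using a made-up `text` column that is not from the repo:

import pandas as pd

# Illustrative data; the "text" column name is hypothetical
data = pd.DataFrame({"text": ["keep me", "", "keep me too"]})

# Same guard-and-reset as the pre_clean change further down this commit
if "original_index" not in data.columns:
    data = data.reset_index(names="original_index")

# A cleaning step that drops rows keeps the pointer back to the source row
cleaned = data[data["text"].str.len() > 0]
print(cleaned)
#    original_index         text
# 0               0      keep me
# 2               2  keep me too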
Dockerfile CHANGED
@@ -18,7 +18,8 @@ COPY requirements_aws.txt .
 RUN pip install torch==2.4.0+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
     && pip install --no-cache-dir --target=/install sentence-transformers==3.0.1 --no-deps \
     && pip install --no-cache-dir --target=/install bertopic==0.16.2 --no-deps \
-    && pip install --no-cache-dir --target=/install -r requirements_aws.txt
+    && pip install --no-cache-dir --target=/install -r requirements_aws.txt \
+    && pip install --no-cache-dir --target=/install gradio==4.44.0
 
 # Add /install to the PYTHONPATH
 ENV PYTHONPATH="/install:${PYTHONPATH}"
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🚀
 colorFrom: red
 colorTo: yellow
 sdk: gradio
-sdk_version: 4.41.0
+sdk_version: 4.44.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py CHANGED
@@ -5,6 +5,7 @@ import numpy as np
 
 from funcs.topic_core_funcs import pre_clean, optimise_zero_shot, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model, change_default_vis_col
 from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params, get_or_create_env_var
+from funcs.embeddings import make_or_load_embeddings
 from sklearn.feature_extraction.text import CountVectorizer
 from funcs.auth import authenticate_user, download_file_from_s3
 
funcs/anonymiser.py CHANGED
@@ -42,61 +42,22 @@ from presidio_anonymizer.entities import OperatorConfig
 from typing import List
 
 # Function to Split Text and Create DataFrame using SpaCy
-def expand_sentences_spacy(df, colname, custom_delimiters:List[str]=[], nlp=nlp):
+def expand_sentences_spacy(df:pd.DataFrame, colname:str, custom_delimiters:List[str]=[], nlp=nlp):
+    '''
+    Expand passages into sentences using Spacy's built in NLP capabilities
+    '''
     expanded_data = []
 
-    # if not custom_delimiters:
-    #     custom_delimiters = []
-
     df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
 
-    # sentencizer = Sentencizer()
-
-    # new_punct_chars = sentencizer.default_punct_chars
-    # new_punct_chars.extend(custom_delimiters)
-
-    # config = {"punct_chars": new_punct_chars}
-    # nlp.add_pipe("sentencizer", config=config)
-
     for index, row in df.iterrows():
         doc = nlp(row[colname])
         for sent in doc.sents:
-            expanded_data.append({'document_index': row['index'], colname: sent.text})
+            expanded_data.append({'original_index': row['original_index'], 'document_index': row['index'], colname: sent.text})
     return pd.DataFrame(expanded_data)
 
-# def expand_sentences_spacy(df, colname, custom_delimiters:List[str]=[], nlp=nlp):
-
-#     #print("Custom delimiters:", custom_delimiters)
-
-#     expanded_data = []
-#     df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
-
-#     sentencizer = Sentencizer()
-
-#     new_punct_chars = sentencizer.default_punct_chars
-#     if custom_delimiters:
-#         new_punct_chars.extend(custom_delimiters)
-
-#     pattern = "(" + "|".join(re.escape(punct) for punct in new_punct_chars) + ")"
-#     #print("Patterns:", pattern)
-#     split_list = []
-
-#     for idx, string in enumerate(df[colname]):
-#         new_split = re.split(pattern, string)
-#         for n, sentence in enumerate(new_split):
-#             if sentence:
-#                 # If there is a split delimiter in the 'sentence' after, add it to the previous sentence as it will be removed at a later step
-#                 if n + 1 < len(new_split):
-#                     if new_split[n + 1]:
-#                         # If the next split is in the list of split characters, then add it to this current sentence
-#                         if new_split[n + 1] in new_punct_chars:
-#                             split_list.append({'document_index': idx, colname: sentence + new_split[n + 1]})
-#                 else:
-#                     split_list.append({'document_index': idx, colname: sentence})
-
-#     return pd.DataFrame(split_list)
 
-def anon_consistent_names(df):
+def anon_consistent_names(df:pd.DataFrame):
     # ## Pick out common names and replace them with the same person value
     df_dict = df.to_dict(orient="list")
 
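The visible behavioural change here is that `expand_sentences_spacy` now writes two ids per sentence: `document_index` (the row's position within the frame being expanded on this pass) and `original_index` (the row's position in the uploaded data, added earlier by `pre_clean`). A rough standalone sketch of what that output looks like, assuming the `en_core_web_sm` model from requirements.txt is installed and an illustrative `text` column:

import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")  # pinned in requirements.txt

# Input already carries original_index (e.g. rows 0 and 2 survived cleaning)
df = pd.DataFrame({
    "original_index": [0, 2],
    "text": ["One sentence. Two sentences.", "A single sentence."],
})

# Mirrors the updated function: re-number the current pass, then keep both ids per sentence
df = df.drop("index", axis=1, errors="ignore").reset_index(names="index")
expanded = []
for _, row in df.iterrows():
    for sent in nlp(row["text"]).sents:
        expanded.append({
            "original_index": row["original_index"],
            "document_index": row["index"],
            "text": sent.text,
        })
print(pd.DataFrame(expanded))
# Both sentences of the first row share original_index 0 and document_index 0;
# the single sentence of the second row gets original_index 2 and document_index 1.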
funcs/embeddings.py CHANGED
@@ -1,5 +1,6 @@
 import time
 import numpy as np
+import os
 from torch import cuda, backends, version
 
 # Check for torch cuda
@@ -12,7 +13,7 @@ if cuda.is_available():
     torch_device = "gpu"
     print("Cuda version installed is: ", version.cuda)
     high_quality_mode = "Yes"
-    #os.system("nvidia-smi")
+    os.system("nvidia-smi")
 else:
     torch_device = "cpu"
     high_quality_mode = "No"
funcs/helper_functions.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 import gradio as gr
 import gzip
 import pickle
+import csv
 import numpy as np
 from bertopic import BERTopic
 from datetime import datetime
@@ -129,7 +130,7 @@ def read_file(filename):
     print("Loading in file")
 
     if file_type == 'csv':
-        file = pd.read_csv(filename, low_memory=False)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
+        file = pd.read_csv(filename)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
     elif file_type == 'xlsx':
         file = pd.read_excel(filename)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
     elif file_type == 'parquet':
funcs/topic_core_funcs.py CHANGED
@@ -93,6 +93,10 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
 
     in_colnames_list_first = in_colnames[0]
 
+    # Reset original index to a new column so you can link it to data outputted from cleaning
+    if not "original_index" in data.columns:
+        data = data.reset_index(names="original_index")
+
     if clean_text == "Yes":
         clean_tic = time.perf_counter()
         print("Starting data clean.")
@@ -343,6 +347,7 @@ def extract_topics(
     if not candidate_topics:
 
         try:
+            # print("vectoriser_model:", vectoriser_model)
 
             topic_model = BERTopic( embedding_model=embedding_model,
                                     vectorizer_model=vectoriser_model,
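The payoff of the new `original_index` column shows up after topic extraction: per-document outputs refer to the cleaned frame, but they can be joined back onto the uploaded file because both sides share `original_index`. A hedged sketch of that join, with invented topic numbers standing in for real BERTopic output (`topic_model.topics_`) and an illustrative `text` column:

import pandas as pd

# The uploaded file, before cleaning; "text" is an illustrative column name
raw_data = pd.DataFrame({"text": ["doc a", "doc b", "doc c", "doc d"]})
raw_data = raw_data.reset_index(names="original_index")

# What the topic model actually saw ("doc b" was dropped by cleaning);
# the topic ids below are made up for illustration
cleaned = pd.DataFrame({
    "original_index": [0, 2, 3],
    "text": ["doc a", "doc c", "doc d"],
    "topic": [1, 0, 1],
})

# Rows removed during cleaning simply come back with a missing topic
labelled = raw_data.merge(cleaned[["original_index", "topic"]], on="original_index", how="left")
print(labelled)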
requirements.txt CHANGED
@@ -4,7 +4,7 @@ transformers==4.41.2
 accelerate==0.26.1
 torch==2.4.0
 bertopic==0.16.2
-spacy==3.7.4
+spacy==3.7.5
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
 pyarrow==14.0.2
 openpyxl==3.1.2