Error"utf-8' codec can't decode byte 0x96 in Python using MacOS
import pandas as pd,re,emoji,os,string
from textblob import TextBlob
import dateutil.parser as dparser
keyword_bank = None
keyword_files_paths = []
def getSubjectivity(text):
return TextBlob(text).sentiment.subjectivity
def getPolarity(text):
return TextBlob(text).sentiment.polarity
def deEmojify(text):
text = str(text).encode("utf-8")
text = emoji.get_emoji_regexp().sub(r'', text.decode('utf-8'))
try:
text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
except:pass
return text
def extractDateFromString(text):
try:
return dparser.parse(str(text),fuzzy=True).date()
except:
return text
def getWordCount(text):
try:
return len(str(text).split())
except:
len(str(text))
def filterText(text):
newstr = ''
string1 = text
for char in text:
if str(char).isalpha():
newstr = newstr+char
else:
newstr = newstr+" "
return newstr.strip().replace(" "," ").replace("@",'')
# for index,x in enumerate(string1):
# if index<len(string1)-1 and index>0:
# if str(string1[index-1]).isnumeric() or str(string1[index+1]).isnumeric() :
# continue
# newstr = newstr +x
# chrs = ['(',')','-',';','%',*[str(x) for x in range(0,10)], '\n',' ']
# for x in chrs:
# newstr = newstr.replace(x,' ')
# return newstr.strip()
def readTweetsFile(path):
file_name = os.path.basename(path)
if '.xlsx' in str(file_name).lower():
df = pd.read_excel(path)
elif '.csv' in str(file_name).lower():
df = pd.read_csv(path)
elif '.txt' in str(file_name).lower():
tweets = str(open(path, 'r', ).read()).strip().replace('\n',' ')
tweets = filterText(text=tweets)
tweets = [str(x).strip() for x in tweets.split('.')]
df = pd.DataFrame(tweets,columns=['Content'])
return df
def getKeywordBankDict(paths):
keyword_bank = {}
for path in paths:
file_name = os.path.basename(path)
if '.txt' in file_name:
with open(path, 'r', encoding="utf-8") as file:
lines = [str(x).strip() for x in file.readlines()]
lines = list(dict.fromkeys([x for x in lines if bool(x)]))
keyword_bank[file_name] = lines
return keyword_bank
def checkOccurance(text,file_name):
keywords = keyword_bank[file_name]
occurance = [word for word in keywords if str(word).strip().lower() in str(text).strip().lower()]
if len(occurance)>0:
return occurance[0]
return 0
results_folder_path = None
def checkResultsFolder():
global results_folder_path
results_folder_path = os.path.join( os.getcwd() , 'Results')
if not os.path.exists(results_folder_path):
os.makedirs(results_folder_path)
checkResultsFolder()
def tweetFileReportGenerator(path,keyword_files_paths,company_name=None,date=None,is_text_file=False):
checkResultsFolder()
df = readTweetsFile(path)
# df = df.head(800)
df['Content'] = df['Content'].apply(deEmojify)
if not is_text_file:
df = df.drop(columns=['PostID','Ticks', 'TweetUrl','PostID','RetweetNum', 'UserHandle', 'LikeNum', 'UserID', 'UserUrl','Location',])
df['Time'] = df['Time'].apply(extractDateFromString)
df.columns = df.columns.str.replace('UserName', 'Company Name')
df.columns = df.columns.str.replace('Time', 'Date')
# df.columns = df.columns.str.replace('Content', 'Tweet')
else:
df['Date'] = [str(date)]*len(df['Content'])
df['Company Name'] = [str(company_name)]*len(df['Content'])
df['Length'] = df['Content'].apply(getWordCount)
df['Polarity'] = df['Content'].apply(getPolarity)
df['Subjectivity'] = df['Content'].apply(getSubjectivity)
for path in keyword_files_paths[:]:
file_name = os.path.basename(path)
df[str(file_name).split('.')[0]] = df['Content'].apply(checkOccurance, args=[str(file_name)])
# df.columns = df.columns.str.replace('Content', 'Tweet')
df_columns = list(df.columns)
df = df.groupby('Company Name')
df = [{'file_name':name,'data':list(data.values.tolist())} for name,data in df]
for dataset in df:
file_name = dataset['file_name']
data = dataset['data']
file_path = os.path.join(results_folder_path,file_name)
new_df = pd.DataFrame(data=data,columns=df_columns)
new_df.to_csv(file_path+'.csv',index=False)
# Interface
print("1. CSV or Excel")
print("2. Text file")
tweet_data_file_type = int(input("Enter file type [1,2] = "))
while tweet_data_file_type not in [1,2]:
print("1. CSV or Excel")
print("2. Text file")
tweet_data_file_type = int(input("Enter file type [1,2] = "))
tweet_data_file_path = input("Enter tweets data file path = ")
while not os.path.exists(tweet_data_file_path):
print("* Enter a valid path for tweets data file")
tweet_data_file_path = input("Enter tweets data file path = ")
keyword_files_paths = []
total_keyword_files = int(input("Enter total number of keyword files = "))
while total_keyword_files < 1 :
print("* Enter a valid number > 0")
total_keyword_files = int(input("Enter total number of keyword files = "))
file_index = 1
while file_index <=total_keyword_files:
path = input(f"Enter path for keyword file {file_index} = ")
if path in keyword_files_paths:
print(f"* Path ({path}) is already provided !")
continue
while not os.path.exists(path):
print(f"* Enter a valid path for keyword file {file_index}")
path = input(f"Enter path for keyword file {file_index} = ")
file_index = file_index+1
keyword_files_paths.append(path)
if tweet_data_file_type == 2:
company_name = input("Enter company name = ")
date = input("Enter date = ")
keyword_bank = getKeywordBankDict(paths=keyword_files_paths[:])
print("Processing ...")
if tweet_data_file_type == 2:
tweetFileReportGenerator(path=tweet_data_file_path,keyword_files_paths=keyword_files_paths,company_name=company_name,date=date,is_text_file=True)
else:
tweetFileReportGenerator(path=tweet_data_file_path,keyword_files_paths=keyword_files_paths)
I replaced utf-8 with different options but it still not working any advise please? I am using MacOs... anychanse to fix this please? This my first day using stackoverflow sorry if my post is not professional as I am new in programming and not aware of fixing errors. Thank you so much for your kind support
Comments
Post a Comment