Error"utf-8' codec can't decode byte 0x96 in Python using MacOS

import pandas as pd,re,emoji,os,string
from textblob import TextBlob
import dateutil.parser as dparser

keyword_bank = None
keyword_files_paths = []


def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity
def deEmojify(text):
    text = str(text).encode("utf-8") 
    text =  emoji.get_emoji_regexp().sub(r'', text.decode('utf-8'))
    try:
        text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    except:pass
    
    return text
def extractDateFromString(text):
    try:
        return dparser.parse(str(text),fuzzy=True).date()
    except:
        return text
def getWordCount(text):
    try:
        return len(str(text).split())
    except:
        len(str(text))
               
def filterText(text): 
    newstr = ''
    string1 = text 
    for char in text:
        if str(char).isalpha():
            newstr = newstr+char
        else:
            newstr = newstr+" "
            
    return newstr.strip().replace("  "," ").replace("@",'')
    # for index,x in enumerate(string1):
    #     if index<len(string1)-1 and index>0:
    #         if str(string1[index-1]).isnumeric() or  str(string1[index+1]).isnumeric() :
    #             continue
    #     newstr = newstr +x
    # chrs = ['(',')','-',';','%',*[str(x) for x in range(0,10)], '\n','  ']
    # for x in chrs:
    #     newstr = newstr.replace(x,' ') 
    # return newstr.strip()  

def readTweetsFile(path):
    file_name = os.path.basename(path)
    if '.xlsx' in str(file_name).lower():
        df = pd.read_excel(path)
    elif '.csv' in str(file_name).lower():
        df = pd.read_csv(path)
    elif '.txt' in  str(file_name).lower():
        tweets = str(open(path, 'r', ).read()).strip().replace('\n',' ')
        tweets = filterText(text=tweets)
        tweets = [str(x).strip() for x in tweets.split('.')]
        df = pd.DataFrame(tweets,columns=['Content'])
    return df

def getKeywordBankDict(paths):
    keyword_bank = {}
    for path in paths:
        file_name = os.path.basename(path)
        if '.txt' in file_name:
            with open(path, 'r', encoding="utf-8") as file: 
                lines = [str(x).strip()  for x in file.readlines()]
                lines = list(dict.fromkeys([x for x in lines if bool(x)]))
                keyword_bank[file_name] = lines

    return keyword_bank  

def checkOccurance(text,file_name):
    keywords = keyword_bank[file_name]
    occurance = [word for word in keywords if str(word).strip().lower() in str(text).strip().lower()]
    if len(occurance)>0:
        return occurance[0] 
    
    return 0


results_folder_path = None
def checkResultsFolder(): 
    global results_folder_path
    results_folder_path = os.path.join( os.getcwd() , 'Results')
    if not os.path.exists(results_folder_path):
        os.makedirs(results_folder_path)


checkResultsFolder()








def tweetFileReportGenerator(path,keyword_files_paths,company_name=None,date=None,is_text_file=False):
    checkResultsFolder()
    df = readTweetsFile(path)
    # df = df.head(800)
    df['Content'] = df['Content'].apply(deEmojify)
 
    if not is_text_file:
        df = df.drop(columns=['PostID','Ticks', 'TweetUrl','PostID','RetweetNum', 'UserHandle', 'LikeNum', 'UserID', 'UserUrl','Location',])
        df['Time'] = df['Time'].apply(extractDateFromString)
        df.columns = df.columns.str.replace('UserName', 'Company Name') 
        df.columns = df.columns.str.replace('Time', 'Date') 
        # df.columns = df.columns.str.replace('Content', 'Tweet') 
    else:
        df['Date'] = [str(date)]*len(df['Content'])
        df['Company Name'] = [str(company_name)]*len(df['Content'])
 
    df['Length'] = df['Content'].apply(getWordCount)
    df['Polarity'] = df['Content'].apply(getPolarity)
    df['Subjectivity'] = df['Content'].apply(getSubjectivity)
    
    
    for path in keyword_files_paths[:]:
        file_name = os.path.basename(path)
        df[str(file_name).split('.')[0]] = df['Content'].apply(checkOccurance, args=[str(file_name)])

    # df.columns = df.columns.str.replace('Content', 'Tweet')
    
    
    df_columns = list(df.columns)
    df = df.groupby('Company Name') 
    df = [{'file_name':name,'data':list(data.values.tolist())} for name,data in df]
    
    for dataset in df:
        file_name = dataset['file_name']
        data = dataset['data']
        file_path = os.path.join(results_folder_path,file_name)
        new_df = pd.DataFrame(data=data,columns=df_columns)
        new_df.to_csv(file_path+'.csv',index=False)




# Interface
print("1. CSV or Excel")
print("2. Text file")
tweet_data_file_type = int(input("Enter file type [1,2] = "))
while tweet_data_file_type not in [1,2]:
    print("1. CSV or Excel")
    print("2. Text file")
    tweet_data_file_type = int(input("Enter file type [1,2] = "))


tweet_data_file_path = input("Enter tweets data file path = ")
while not os.path.exists(tweet_data_file_path):
    print("* Enter a valid path for tweets data file")
    tweet_data_file_path = input("Enter tweets data file path = ")


keyword_files_paths = []
total_keyword_files = int(input("Enter total number of keyword files = "))
while total_keyword_files < 1 :
    print("* Enter a valid number > 0")
    total_keyword_files = int(input("Enter total number of keyword files = "))

file_index = 1
while file_index <=total_keyword_files:
    path = input(f"Enter path for keyword file {file_index} = ")
    if path in keyword_files_paths:
        print(f"* Path ({path}) is already provided !")
        continue
    while  not os.path.exists(path):
        print(f"* Enter a valid path for keyword file {file_index}")
        path = input(f"Enter path for keyword file {file_index} = ")
    file_index = file_index+1
    keyword_files_paths.append(path)    

if tweet_data_file_type == 2:
    company_name = input("Enter company name = ")
    date = input("Enter date = ")

keyword_bank = getKeywordBankDict(paths=keyword_files_paths[:])

print("Processing ...")
if tweet_data_file_type == 2: 
    tweetFileReportGenerator(path=tweet_data_file_path,keyword_files_paths=keyword_files_paths,company_name=company_name,date=date,is_text_file=True)
else:
    tweetFileReportGenerator(path=tweet_data_file_path,keyword_files_paths=keyword_files_paths)

 

I replaced utf-8 with different options but it still not working any advise please? I am using MacOs... anychanse to fix this please? This my first day using stackoverflow sorry if my post is not professional as I am new in programming and not aware of fixing errors. Thank you so much for your kind support



Comments

Popular posts from this blog

Spring Elasticsearch Operations

Object oriented programming concepts (OOPs)

Spring Boot and Vaadin : Filtering rows in Vaadin Grid