2022-02-26

Error"utf-8' codec can't decode byte 0x96 in Python using MacOS

import pandas as pd,re,emoji,os,string
from textblob import TextBlob
import dateutil.parser as dparser

keyword_bank = None
keyword_files_paths = []


def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity
def deEmojify(text):
    text = str(text).encode("utf-8") 
    text =  emoji.get_emoji_regexp().sub(r'', text.decode('utf-8'))
    try:
        text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    except:pass
    
    return text
def extractDateFromString(text):
    try:
        return dparser.parse(str(text),fuzzy=True).date()
    except:
        return text
def getWordCount(text):
    try:
        return len(str(text).split())
    except:
        len(str(text))
               
def filterText(text): 
    newstr = ''
    string1 = text 
    for char in text:
        if str(char).isalpha():
            newstr = newstr+char
        else:
            newstr = newstr+" "
            
    return newstr.strip().replace("  "," ").replace("@",'')
    # for index,x in enumerate(string1):
    #     if index<len(string1)-1 and index>0:
    #         if str(string1[index-1]).isnumeric() or  str(string1[index+1]).isnumeric() :
    #             continue
    #     newstr = newstr +x
    # chrs = ['(',')','-',';','%',*[str(x) for x in range(0,10)], '\n','  ']
    # for x in chrs:
    #     newstr = newstr.replace(x,' ') 
    # return newstr.strip()  

def readTweetsFile(path):
    file_name = os.path.basename(path)
    if '.xlsx' in str(file_name).lower():
        df = pd.read_excel(path)
    elif '.csv' in str(file_name).lower():
        df = pd.read_csv(path)
    elif '.txt' in  str(file_name).lower():
        tweets = str(open(path, 'r', ).read()).strip().replace('\n',' ')
        tweets = filterText(text=tweets)
        tweets = [str(x).strip() for x in tweets.split('.')]
        df = pd.DataFrame(tweets,columns=['Content'])
    return df

def getKeywordBankDict(paths):
    keyword_bank = {}
    for path in paths:
        file_name = os.path.basename(path)
        if '.txt' in file_name:
            with open(path, 'r', encoding="utf-8") as file: 
                lines = [str(x).strip()  for x in file.readlines()]
                lines = list(dict.fromkeys([x for x in lines if bool(x)]))
                keyword_bank[file_name] = lines

    return keyword_bank  

def checkOccurance(text,file_name):
    keywords = keyword_bank[file_name]
    occurance = [word for word in keywords if str(word).strip().lower() in str(text).strip().lower()]
    if len(occurance)>0:
        return occurance[0] 
    
    return 0


results_folder_path = None
def checkResultsFolder(): 
    global results_folder_path
    results_folder_path = os.path.join( os.getcwd() , 'Results')
    if not os.path.exists(results_folder_path):
        os.makedirs(results_folder_path)


checkResultsFolder()








def tweetFileReportGenerator(path,keyword_files_paths,company_name=None,date=None,is_text_file=False):
    checkResultsFolder()
    df = readTweetsFile(path)
    # df = df.head(800)
    df['Content'] = df['Content'].apply(deEmojify)
 
    if not is_text_file:
        df = df.drop(columns=['PostID','Ticks', 'TweetUrl','PostID','RetweetNum', 'UserHandle', 'LikeNum', 'UserID', 'UserUrl','Location',])
        df['Time'] = df['Time'].apply(extractDateFromString)
        df.columns = df.columns.str.replace('UserName', 'Company Name') 
        df.columns = df.columns.str.replace('Time', 'Date') 
        # df.columns = df.columns.str.replace('Content', 'Tweet') 
    else:
        df['Date'] = [str(date)]*len(df['Content'])
        df['Company Name'] = [str(company_name)]*len(df['Content'])
 
    df['Length'] = df['Content'].apply(getWordCount)
    df['Polarity'] = df['Content'].apply(getPolarity)
    df['Subjectivity'] = df['Content'].apply(getSubjectivity)
    
    
    for path in keyword_files_paths[:]:
        file_name = os.path.basename(path)
        df[str(file_name).split('.')[0]] = df['Content'].apply(checkOccurance, args=[str(file_name)])

    # df.columns = df.columns.str.replace('Content', 'Tweet')
    
    
    df_columns = list(df.columns)
    df = df.groupby('Company Name') 
    df = [{'file_name':name,'data':list(data.values.tolist())} for name,data in df]
    
    for dataset in df:
        file_name = dataset['file_name']
        data = dataset['data']
        file_path = os.path.join(results_folder_path,file_name)
        new_df = pd.DataFrame(data=data,columns=df_columns)
        new_df.to_csv(file_path+'.csv',index=False)




# Interface
print("1. CSV or Excel")
print("2. Text file")
tweet_data_file_type = int(input("Enter file type [1,2] = "))
while tweet_data_file_type not in [1,2]:
    print("1. CSV or Excel")
    print("2. Text file")
    tweet_data_file_type = int(input("Enter file type [1,2] = "))


tweet_data_file_path = input("Enter tweets data file path = ")
while not os.path.exists(tweet_data_file_path):
    print("* Enter a valid path for tweets data file")
    tweet_data_file_path = input("Enter tweets data file path = ")


keyword_files_paths = []
total_keyword_files = int(input("Enter total number of keyword files = "))
while total_keyword_files < 1 :
    print("* Enter a valid number > 0")
    total_keyword_files = int(input("Enter total number of keyword files = "))

file_index = 1
while file_index <=total_keyword_files:
    path = input(f"Enter path for keyword file {file_index} = ")
    if path in keyword_files_paths:
        print(f"* Path ({path}) is already provided !")
        continue
    while  not os.path.exists(path):
        print(f"* Enter a valid path for keyword file {file_index}")
        path = input(f"Enter path for keyword file {file_index} = ")
    file_index = file_index+1
    keyword_files_paths.append(path)    

if tweet_data_file_type == 2:
    company_name = input("Enter company name = ")
    date = input("Enter date = ")

keyword_bank = getKeywordBankDict(paths=keyword_files_paths[:])

print("Processing ...")
if tweet_data_file_type == 2: 
    tweetFileReportGenerator(path=tweet_data_file_path,keyword_files_paths=keyword_files_paths,company_name=company_name,date=date,is_text_file=True)
else:
    tweetFileReportGenerator(path=tweet_data_file_path,keyword_files_paths=keyword_files_paths)

 

I replaced utf-8 with different options but it still not working any advise please? I am using MacOs... anychanse to fix this please? This my first day using stackoverflow sorry if my post is not professional as I am new in programming and not aware of fixing errors. Thank you so much for your kind support



No comments:

Post a Comment