SYBCS Scilab Program

7 Comments

  1. import re
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.probability import FreqDist
    from heapq import nlargest


    nltk.download('punkt')
    nltk.download('stopwords')

    def preprocess_text(text):
        # Remove special characters and digits
        processed_text = re.sub(r'[^A-Za-z\s]', '', text)
        return processed_text

    def extractive_summarization(text, num_sentences=3):
        # Tokenize the text into sentences
        sentences = sent_tokenize(text)

        # Tokenize the text into words
        words = word_tokenize(text.lower())

        # Remove stopwords and punctuation tokens
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word.isalpha() and word not in stop_words]

        # Calculate word frequencies
        word_freq = FreqDist(filtered_words)

        # Calculate sentence scores based on word frequencies
        sentence_scores = {}
        for sentence in sentences:
            for word in word_tokenize(sentence.lower()):
                if word in word_freq:
                    if sentence not in sentence_scores:
                        sentence_scores[sentence] = word_freq[word]
                    else:
                        sentence_scores[sentence] += word_freq[word]

        # Select the top N sentences with the highest scores
        summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)

        # Join the summary sentences to form the summary
        summary = ' '.join(summary_sentences)
        return summary

    if __name__ == "__main__":
        # Sample text paragraph
        text = """Data analytics is the process of examining data sets to draw conclusions about the information they contain,
        increasingly with the aid of specialized systems and software. Data analytics technologies and techniques are widely used
        in commercial industries to enable organizations to make more-informed business decisions. Scientists and researchers also
        use analytics tools to verify or disprove scientific models, theories and hypotheses."""

        # Preprocess the text (shown for inspection only)
        processed_text = preprocess_text(text)
        print("Processed Text:\n", processed_text)

        # Generate summary from the original text, since sent_tokenize needs
        # the sentence-ending punctuation that preprocess_text strips out
        summary = extractive_summarization(text)
        print("\nSummary:\n", summary)

  2. import re
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.probability import FreqDist
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    # Download necessary NLTK resources
    nltk.download('punkt')
    nltk.download('stopwords')

    def preprocess_text(text):
        # Remove special characters and digits
        processed_text = re.sub(r'[^A-Za-z\s]', '', text)
        return processed_text

    def remove_stopwords(words):
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return filtered_words

    def plot_word_frequencies(freq_dist):
        plt.figure(figsize=(10, 6))
        freq_dist.plot(30, cumulative=False)
        plt.show()

    def plot_wordcloud(text):
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()

    if __name__ == "__main__":
        # Sample text paragraph
        text = """Data analytics is the process of examining data sets to draw conclusions about the information they contain,
        increasingly with the aid of specialized systems and software. Data analytics technologies and techniques are widely used
        in commercial industries to enable organizations to make more-informed business decisions. Scientists and researchers also
        use analytics tools to verify or disprove scientific models, theories and hypotheses."""

        # Preprocess the text
        processed_text = preprocess_text(text)

        # Tokenize the paragraph to extract words and sentences
        words = word_tokenize(processed_text)
        sentences = sent_tokenize(text)

        # Remove stopwords
        filtered_words = remove_stopwords(words)

        # Calculate the word frequency distribution
        freq_dist = FreqDist(filtered_words)

        # Print results
        print("Processed Text:\n", processed_text)
        print("\nWords:\n", words)
        print("\nSentences:\n", sentences)
        print("\nFiltered Words:\n", filtered_words)
        print("\nWord Frequency Distribution:\n", freq_dist)

        # Plot word frequencies
        plot_word_frequencies(freq_dist)

        # Plot wordcloud
        plot_wordcloud(processed_text)
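    If the script runs somewhere without a display, the word cloud can be written to an image file instead of opening a window; a minimal optional variant (the filename is arbitrary):

    # Save the word cloud to a PNG file rather than showing it
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(processed_text)
    wordcloud.to_file('wordcloud.png')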

  3. import nltk
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    # Download the VADER lexicon
    nltk.download('vader_lexicon')

    def sentiment_analysis(review):
        sid = SentimentIntensityAnalyzer()
        sentiment_scores = sid.polarity_scores(review)
        sentiment = 'Neutral'

        if sentiment_scores['compound'] >= 0.05:
            sentiment = 'Positive'
        elif sentiment_scores['compound'] <= -0.05:
            sentiment = 'Negative'

        return sentiment, sentiment_scores

    if __name__ == "__main__":
        # Review messages
        reviews = [
            "I purchased headphones online. I am very happy with the product.",
            "I saw the movie yesterday. The animation was really good but the script was ok.",
            "I enjoy listening to music",
            "I take a walk in the park everyday"
        ]

        # Perform sentiment analysis on each review message
        for review in reviews:
            sentiment, scores = sentiment_analysis(review)
            print(f"Review: {review}")
            print(f"Sentiment: {sentiment}")
            print(f"Sentiment Scores: {scores}\n")

  4. import re
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    # Download necessary NLTK resources
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

    def read_whatsapp_chat(file_path):
        # Read the exported chat from the path passed in (the original hard-coded the filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = file.read()
        return data

    def preprocess_text(text):
        # Remove special characters and digits
        processed_text = re.sub(r'[^A-Za-z\s]', '', text)
        return processed_text

    def remove_stopwords_and_lemmatize(words):
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        filtered_words = [lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stop_words]
        return filtered_words

    def plot_wordcloud(text):
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()

    if __name__ == "__main__":
        # File path to the exported WhatsApp chat .txt file
        file_path = "path/to/whatsapp_chat.txt"

        # Step i: Read the exported WhatsApp chat .txt file
        chat_data = read_whatsapp_chat(file_path)

        # Step ii: Tokenize the read data into sentences
        sentences = sent_tokenize(chat_data)
        print("Tokenized Sentences:\n", sentences)

        # Tokenize the data into words
        words = word_tokenize(chat_data)

        # Step iii: Remove stopwords and perform lemmatization
        processed_text = preprocess_text(chat_data)
        filtered_words = remove_stopwords_and_lemmatize(word_tokenize(processed_text))

        # Print the processed words
        print("\nFiltered and Lemmatized Words:\n", filtered_words)

        # Join the filtered words for wordcloud generation
        filtered_text = ' '.join(filtered_words)

        # Step iv: Plot the wordcloud for the given data
        # (see the optional metadata-stripping sketch below)
        plot_wordcloud(filtered_text)
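    An exported WhatsApp chat usually prefixes every line with a timestamp and the sender's name, and the exact layout varies by phone and locale. A hedged sketch for stripping that metadata before tokenizing, assuming lines shaped like "21/05/2023, 10:15 - Name: message":

    def strip_whatsapp_metadata(raw_chat):
        # Drop "date, time - sender:" prefixes; lines that don't match are kept unchanged
        pattern = r'^\d{1,2}/\d{1,2}/\d{2,4},\s*\d{1,2}:\d{2}\s*(?:[APap][Mm])?\s*-\s*[^:]+:\s*'
        cleaned_lines = [re.sub(pattern, '', line) for line in raw_chat.splitlines()]
        return '\n'.join(cleaned_lines)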

  5. import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import nltk
    from nltk.corpus import stopwords
    from wordcloud import WordCloud

    # Download necessary NLTK resources
    nltk.download('stopwords')

    # Function to preprocess text by removing stopwords
    def preprocess_text(text):
        stop_words = set(stopwords.words('english'))
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return ' '.join(filtered_words)

    def convert_to_numeric(value):
        # Remove any commas and convert to float
        value = value.replace(',', '')
        if 'M' in value:
            return float(value.replace('M', '')) * 1_000_000
        elif 'K' in value:
            return float(value.replace('K', '')) * 1_000
        return float(value)

    # Load the dataset

    #url = 'https://www.kaggle.com/datasets/prasertk/top-1000-instagram-influencers/download'
    df = pd.read_csv('top_1000_instagrammers.csv')

    # Convert 'Followers' and 'Authentic engagement' columns to numeric
    df['Followers'] = df['Followers'].apply(convert_to_numeric)
    df['Authentic Engagement'] = df['Authentic Engagement'].apply(convert_to_numeric)

    # Inspect column names
    print("Column names:\n", df.columns)

    # Inspect the first few rows of the dataset
    print("\nFirst few rows of the dataset:\n", df.head())

    # i. Find the top 5 Instagram influencers from India
    top_5_india = df[df['Audience Country'] == 'India'].nlargest(5, 'Followers')
    print("\nTop 5 Instagram influencers from India:\n", top_5_india[['Name', 'Followers']])

    # ii. Find the Instagram account having the least number of followers
    least_followers = df.nsmallest(1, 'Followers')
    print("\nInstagram account with the least number of followers:\n", least_followers[['Name', 'Followers']])

    # iii. Read the "Category" column, remove stopwords, and plot the wordcloud
    categories = df['Category'].dropna().apply(preprocess_text)
    all_categories_text = ' '.join(categories)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_categories_text)

    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    # iv. Group the Instagram accounts category-wise
    grouped_by_category = df.groupby('Category')['Name'].count().reset_index()
    print("\nNumber of Instagram accounts per category:\n", grouped_by_category)

    # v. Visualize the dataset and plot the relationship between Followers and Authentic engagement
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='Followers', y='Authentic Engagement')
    plt.title('Relationship between Followers and Authentic Engagement')
    plt.xlabel('Followers')
    plt.ylabel('Authentic Engagement')
    plt.show()
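    Follower counts in this dataset span several orders of magnitude, so the linear scatter in step v tends to bunch most accounts near the origin; an optional variant of the same plot on logarithmic axes (same columns, only the axis scaling changes):

    # Re-plot Followers vs. Authentic Engagement on log-log axes
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='Followers', y='Authentic Engagement')
    plt.xscale('log')
    plt.yscale('log')
    plt.title('Followers vs. Authentic Engagement (log-log)')
    plt.xlabel('Followers (log scale)')
    plt.ylabel('Authentic Engagement (log scale)')
    plt.show()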

  6. import pandas as pd
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    # Download necessary NLTK resources
    nltk.download('punkt')
    nltk.download('stopwords')

    # i. Read the dataset and perform data cleaning operations on it
    url = 'covid_2021_1.csv'

    # Specify the correct delimiter (e.g., comma, semicolon, tab, etc.)
    delimiter = ',' # Change this if needed

    # Load the dataset
    # on_bad_lines='skip' replaces the older error_bad_lines=False, which was removed in pandas 2.0
    df = pd.read_csv(url, delimiter=delimiter, on_bad_lines='skip')

    # Perform data cleaning operations
    df.dropna(subset=['comment_text'], inplace=True) # Remove rows with missing comment_text
    df['comment_text'] = df['comment_text'].astype(str) # Ensure all comments are strings

    # Print the first few rows of the cleaned dataset
    print("Cleaned Dataset:\n", df.head())

    # ii. Tokenize the comments in words
    df['tokenized_comments'] = df['comment_text'].apply(word_tokenize)

    # Print the tokenized comments
    print("\nTokenized Comments:\n", df['tokenized_comments'].head())

    # iii. Perform sentiment analysis and find the percentage of positive, negative, and neutral comments
    analyzer = SentimentIntensityAnalyzer()

    def analyze_sentiment(text):
        sentiment_scores = analyzer.polarity_scores(text)
        if sentiment_scores['compound'] >= 0.05:
            return 'Positive'
        elif sentiment_scores['compound'] <= -0.05:
            return 'Negative'
        else:
            return 'Neutral'

    df['sentiment'] = df['comment_text'].apply(analyze_sentiment)

    # Calculate the percentage of each sentiment
    sentiment_counts = df['sentiment'].value_counts(normalize=True) * 100
    print("\nSentiment Analysis Percentages:\n", sentiment_counts)

  7. import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Dictionary of file paths for different countries
    files = {
        "GB": "GBvideos.csv",
        "MX": "MXvideos.csv",
        "KR": "KRvideos.csv",
        "DE": "DEvideos.csv",
        "FR": "FRvideos.csv",
        "US": "USvideos.csv",
        "IN": "INvideos.csv",
        "RU": "RUvideos.csv",
        "JP": "JPvideos.csv",
        "CA": "CAvideos.csv"
    }

    # Read each CSV file into a DataFrame and store it in a dictionary
    df_dict = {country: pd.read_csv(path, encoding='latin1') for country, path in files.items()}

    # Perform operations on each DataFrame
    for country, df in df_dict.items():
        # Drop rows with missing values
        df = df.dropna()

        # Convert 'views', 'likes', 'dislikes', and 'comment_count' columns to numeric
        df['views'] = pd.to_numeric(df['views'], errors='coerce')
        df['likes'] = pd.to_numeric(df['likes'], errors='coerce')
        df['dislikes'] = pd.to_numeric(df['dislikes'], errors='coerce')
        df['comment_count'] = pd.to_numeric(df['comment_count'], errors='coerce')

        # Print the first few rows of the cleaned dataset
        print(f"\nCleaned Dataset for {country}:\n", df.head())

        # ii. Find the total views, total likes, total dislikes, and comment count
        total_views = df['views'].sum()
        total_likes = df['likes'].sum()
        total_dislikes = df['dislikes'].sum()
        total_comment_count = df['comment_count'].sum()

        print(f"\nTotal Views for {country}:", total_views)
        print(f"Total Likes for {country}:", total_likes)
        print(f"Total Dislikes for {country}:", total_dislikes)
        print(f"Total Comment Count for {country}:", total_comment_count)

        # iii. Find the least and topmost liked and commented videos
        topmost_liked_video = df.loc[df['likes'].idxmax()]
        least_liked_video = df.loc[df['likes'].idxmin()]
        topmost_commented_video = df.loc[df['comment_count'].idxmax()]
        least_commented_video = df.loc[df['comment_count'].idxmin()]

        print(f"\nTopmost Liked Video for {country}:\n", topmost_liked_video)
        print(f"\nLeast Liked Video for {country}:\n", least_liked_video)
        print(f"\nTopmost Commented Video for {country}:\n", topmost_commented_video)
        print(f"\nLeast Commented Video for {country}:\n", least_commented_video)

        # iv. Perform year-wise statistics for views and plot the analyzed data
        df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')
        df['year'] = df['trending_date'].dt.year

        yearly_views = df.groupby('year')['views'].sum().reset_index()
        print(f"\nYear-wise Views for {country}:\n", yearly_views)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='year', y='views', data=yearly_views)
        plt.title(f'Year-wise Total Views for {country}')
        plt.xlabel('Year')
        plt.ylabel('Total Views')
        plt.show()

        # v. Plot the viewers who reacted on videos (Likes + Dislikes)
        df['total_reactions'] = df['likes'] + df['dislikes']

        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=df, x='views', y='total_reactions')
        plt.title(f'Viewers Who Reacted on Videos for {country}')
        plt.xlabel('Views')
        plt.ylabel('Total Reactions (Likes + Dislikes)')
        plt.show()
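    To compare the regions against each other rather than one at a time, an optional post-loop summary can collect the per-country view totals into a single bar chart (a small sketch reusing the same df_dict):

    # Optional: total trending views per region in one chart
    totals = {country: pd.to_numeric(df['views'], errors='coerce').sum()
              for country, df in df_dict.items()}
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(totals.keys()), y=list(totals.values()))
    plt.title('Total Trending Views by Region')
    plt.xlabel('Region')
    plt.ylabel('Total Views')
    plt.show()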
