# Source code for lda_over_time.models.lda_seq_model

"""
LdaSeqModel brings Gensim's LdaSeqModel functionality to our library.

Its main advantage over other models is that it can detect changes in the \
vocabulary used to describe each topic over time, making it more precise in \
classifying each topic. But it is slower to run.
"""
# IMPORTS
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldaseqmodel import LdaSeqModel as LdaSeqModel_
from lda_over_time.models.dtm_model_interface import DtmModelInterface
from multiprocessing import cpu_count

import numpy as np
import pandas as pd
import warnings


# TYPING
from typing import List, Optional


# Install a blanket "ignore" filter at import time. No category or module
# is given, so the filter is not restricted to any particular warning.
# NOTE(review): this also hides warnings raised by unrelated libraries in
# the same process -- confirm that a global filter is really intended.
warnings.filterwarnings("ignore")


class LdaSeqModel(DtmModelInterface):
    """
    Dynamic topic model backed by Gensim's LdaSeqModel.

    Gensim's LdaSeqModel lets the vocabulary that describes a topic vary
    along time, so it can follow changes in the way a topic is talked about.
    With this feature it may be more precise than PrevalenceModel, but it is
    slower.

    :param corpus: List of documents' texts.
    :type corpus: list[str]

    :param dates: List of documents' publishing dates, one per document.
    :type dates: list[str]

    :param date_format: The date format used in `dates`, e.g. "%Y/%m/%d"
        for "YYYY/MM/DD" format. More info at `documentation`_.
    :type date_format: str

    :param freq: The frequency used to group texts, e.g. "1M15D" for a
        frequency of a month and 15 days. Useful notations:
        day = "D"; month = "M"; year = "Y". More info at `pandas`_.
    :type freq: str

    :param n_topics: Number of topics that the DTM model should find. The
        default value is 100.
    :type n_topics: int, optional

    :param sep: Separator used to split each document into words. The
        default (`None`) splits on any run of blank space.
    :type sep: str, optional

    :param workers: Number of workers (cpus) to use. If not provided, the
        value returned by `multiprocessing.cpu_count()` is stored.
    :type workers: int, optional

    .. _documentation: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
    .. _pandas: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
    """

    def __init__(self,
                 corpus: List[str],
                 dates: List[str],
                 date_format: str,
                 freq: str,
                 n_topics: int = 100,
                 sep: Optional[str] = None,
                 workers: Optional[int] = None):
        """
        Store the training configuration; no training happens here.

        Call :meth:`train` afterwards to fit the model and compute results.
        """
        # Keep the raw arguments so that train() can use them later
        self.corpus = corpus
        self.dates = dates
        self.date_format = date_format
        self.freq = freq
        self.n_topics = n_topics
        self.sep = sep

        # Fall back to the machine's cpu count when no value was given.
        # NOTE(review): `workers` is only stored; train() does not forward
        # it to Gensim's LdaSeqModel.
        self.workers = cpu_count() if workers is None else workers

    def __prepare_corpus(self, corpus, dates, date_format, freq, sep):
        """
        Convert raw texts and dates into the inputs expected by Gensim.

        :param corpus: Documents' texts.
        :param dates: Documents' dates, parsed with `date_format`.
        :param date_format: Format string handed to `pandas.to_datetime`.
        :param freq: Frequency handed to `pandas.Grouper`.
        :param sep: Separator handed to `str.split`.

        :return: A tuple `(corpus_bow, dictionary, timestamps, timeslices)`:
            the bag-of-words documents ordered by date, the Gensim
            dictionary built from them, the label of each time slice and
            the number of documents in each time slice.
        :rtype: tuple
        """
        # Parse the dates and order the documents chronologically
        frame = pd.DataFrame({
            'date': pd.to_datetime(dates, format=date_format),
            'text': corpus,
        }).sort_values('date')

        # Bucket the documents by the requested frequency
        grouped = frame.groupby(pd.Grouper(key='date', freq=freq), sort=True)

        # Number of documents per time slice, in chronological order.
        # NOTE(review): pandas may report a period without documents as a
        # slice of size 0 -- confirm how Gensim handles such slices.
        timeslices = grouped.size().tolist()

        # Label (timestamp) of every time slice
        timestamps = list(grouped.groups.keys())

        # Tokenize every document, keeping the chronological order
        tokens = [text.split(sep) for text in frame['text'].values]

        # Build the vocabulary and the bag-of-words representation
        dictionary = Dictionary(tokens)
        corpus_bow = [dictionary.doc2bow(doc) for doc in tokens]

        return corpus_bow, dictionary, timestamps, timeslices

    def __prepare_results(self,
                          model,
                          timestamps,
                          n_documents,
                          n_topics,
                          time_slices):
        """
        Compute, for every time slice, the share of documents whose main \
        topic is each topic.

        :param model: Trained model; only `model.doc_topics(i)` is used and
            its result must support `argmax()`.
        :param timestamps: One label per time slice, stored in the `date`
            column of the result.
        :param n_documents: Number of documents the model was trained on.
        :param n_topics: Number of topics of the model.
        :param time_slices: Number of documents in each time slice, in the
            same order as the documents.

        :return: A dataframe with one row per time slice, one column per
            topic id (`0 .. n_topics - 1`) and a `date` column. A slice
            without documents gets NaN in every topic column.
        :rtype: pd.core.frame.DataFrame
        """
        # Main (most probable) topic of every document
        main_topics = np.array(
            [model.doc_topics(doc).argmax() for doc in range(n_documents)],
            dtype=np.intp,
        )

        # Boundaries [start, stop) of every time slice inside main_topics.
        # cumsum accepts any sequence of sizes, not only a Python list.
        bounds = np.concatenate(
            ([0], np.cumsum(time_slices, dtype=np.intp))
        )

        proportions = []
        for start, stop in zip(bounds[:-1], bounds[1:]):

            # Occurrences of each topic; minlength pads the missing topics
            counts = np.bincount(main_topics[start:stop],
                                 minlength=n_topics)
            total = counts.sum()

            if total == 0:
                # No documents in this slice: the proportion is undefined.
                # Spell the NaN out instead of relying on a 0/0 division.
                shares = np.full(len(counts), np.nan)
            else:
                shares = counts / total

            # One mapping of topic id -> proportion per time slice
            proportions.append(dict(zip(range(n_topics), shares)))

        # One row per time slice, one column per topic
        result = pd.DataFrame(proportions)

        # Attach the label of each time slice
        result['date'] = timestamps

        return result

    def get_results(self):
        """
        Return the table with the evolution of each topic over time.

        The table is computed by :meth:`train`, which must be called first.

        :return: A Pandas' dataframe where each row represents a time slice,
            with a `date` column and one column per topic holding that
            topic's weight in the period.
        :rtype: pd.core.frame.DataFrame
        """
        return self.results

    def get_topic_words(self, topic_id, i, n):
        """
        Return the top n words that best describe a topic in a time slice.

        :param topic_id: The id of the desired topic.
        :type topic_id: int

        :param i: The position of the desired time slice in chronological
            order; the first (oldest) time slice is indexed by 1.
        :type i: int

        :param n: How many of the words that best describe the topic at
            that time slice should be returned.
        :type n: int

        :return: The list of top n words that best describe the requested
            topic at the requested time.
        :rtype: list[str]
        """
        # The model indexes time slices from 0, hence the `i - 1`
        entries = self.model.print_topic(topic_id, i - 1, n)

        # Keep only the first element (the word) of each entry
        return [entry[0] for entry in entries]

    @property
    def n_timeslices(self):
        """
        Number of time slices found during training.

        :return: The number of time slices found in corpus.
        :rtype: int
        """
        return len(self.time_slices)

    def prepare_args(self, i):
        """
        Return a dictionary with all values needed to call the \
        PyLdaVis.prepare method.

        :param i: The position of the desired time slice in chronological
            order; the first (oldest) time slice is indexed by 1.
        :type i: int

        :return: A dictionary ready to be passed to PyLdaVis.
        :rtype: dict[str, any]
        """
        # The model indexes time slices from 0, hence the `i - 1`
        doc_topics, topic_term, doc_lengths, term_frequency, vocab = \
            self.model.dtm_vis(time=i - 1, corpus=self.bow)

        return {
            'topic_term_dists': topic_term,
            'doc_topic_dists': doc_topics,
            'doc_lengths': doc_lengths,
            'vocab': vocab,
            'term_frequency': term_frequency,
        }

    def train(self):
        """
        Train the DTM model and compute the topic proportions over time.

        After this call the attributes `bow`, `dictionary`, `timestamps`,
        `time_slices`, `model` and `results` are set.

        :return: Nothing.
        :rtype: None
        """
        # Prepare the data used in training
        self.bow, self.dictionary, self.timestamps, self.time_slices = \
            self.__prepare_corpus(self.corpus,
                                  self.dates,
                                  self.date_format,
                                  self.freq,
                                  self.sep)

        # Train the dtm model
        self.model = LdaSeqModel_(corpus=self.bow,
                                  time_slice=self.time_slices,
                                  num_topics=self.n_topics,
                                  id2word=self.dictionary)

        # Proportion of each topic in each time slice
        self.results = self.__prepare_results(self.model,
                                              self.timestamps,
                                              len(self.bow),
                                              self.n_topics,
                                              self.time_slices)