Saturday, December 17, 2022

Facets in Whoosh - an example !

from whoosh import index, qparser
from whoosh.fields import *

# Define the schema for the index
schema = Schema(title=TEXT(stored=True),
                author=TEXT(stored=True),
                category=KEYWORD(stored=True),
                content=TEXT)

# Create the index
ix = index.create_in("indexdir", schema)

# Open the index for writing
writer = ix.writer()

# Add documents to the index
writer.add_document(title="Document 1",
                    author="Author 1",
                    category="Category 1",
                    content="This is the content of document 1")
writer.add_document(title="Document 2",
                    author="Author 2",
                    category="Category 2",
                    content="This is the content of document 2")
writer.add_document(title="Document 3",
                    author="Author 3",
                    category="Category 1",
                    content="This is the content of document 3")

# Commit the changes
writer.commit()

# Open the index for reading
searcher = ix.searcher()

# Parse the query
parser = qparser.QueryParser("content", ix.schema)
query = parser.parse("document")

# Perform the search and return the results
results = searcher.search(query)

# Iterate through the results and print them
for result in results:
    print(result)

# Faceted search:
facet = searcher.facet_by_fieldname("category")
for category in facet:
    print(f"Category: {category}")
    print(f"Number of documents: {facet[category]}")

Friday, December 16, 2022

How to add documents to the existing index in Whoosh?

To add documents to an existing index in Whoosh, you will need to follow these steps: 


1. First, you will need to open the index using the whoosh.index.open_dir function. This function takes the directory where the index is stored as an argument and returns an Index object:

from whoosh import index

# Open the index
ix = index.open_dir("indexdir")



2. Next, you will need to create a whoosh.writing.IndexWriter object using the Index.writer method. The IndexWriter object allows you to add documents to the index:

# Open an index writer
writer = ix.writer()


3. Now you can use the IndexWriter.add_document method to add documents to the index. The add_document method takes a dictionary of fields and values as an argument. The keys of the dictionary should match the field names in the index's schema, and the values should be the field values:

# Add a document to the index
writer.add_document( title="My Document", body="This is the body of my document.", date="2022-01-01", )  


4. After you have added all the documents you want to add, you will need to call the IndexWriter.commit method to save the changes to the index: 


# Commit the changes
writer.commit()

Thursday, July 8, 2021

Comparing two dimensionally different dataframes based on one mutual column

Comparing two dimensionally different dataframes based on one mutual column
In [1]:
import pandas as pd
In [2]:
def compareDF(df1,df2,ColumnName, diff=''):
    
    leftList=[]
    rightList=[]
    for i in list(df1):
        leftList.append(str(i)+'_l')
    for i in list(df2):
        rightList.append(str(i)+'_r')
    
    
    df1.columns=leftList
    df1=df1.rename(columns={str(ColumnName)+"_l": ColumnName})
    df2.columns=rightList
    df2=df2.rename(columns={str(ColumnName)+"_r": ColumnName})
    df = pd.merge(df1,df2, on=ColumnName, suffixes=('', '') ,how='outer')
    df=df.fillna(0)
    df.sort_values(ColumnName, inplace=True)
    dropList=[]
    for index, row in df.iterrows():
        total=0
        for i in list(df):
            if i==ColumnName:
                continue
            else:
                total=row[i]+total
        if total==0:
            dropList.append(index)
    df=df.drop(dropList)
    df=df.reset_index(drop=True)
    
    #if diff=='x':
        
    
    return df
In [3]:
def formatX(c):  # Optional
    c=str(c)
    c=c.replace(',', '')
    c=c.replace('$','')
    return float(str(c))
In [4]:
def cleanDF(df, columnName): 
    df=df.fillna(0)
    for i in list(df):
        try:
            if i != columnName:
                df[i] = df.apply(lambda x: formatX(x[i]),axis=1)
        except Exception as e:
            continue
    return df
In [5]:
def getDiff(df, columnName):
    
    leftList=[]
    rightList=[]
    
    for i in list(df):
        if str(i)[-2:]=='_l':
            leftList.append(str(i)[:-2])
        elif str(i)[-2:]=='_r':
            rightList.append(str(i)[:-2])
    
    for i in leftList:
        if i not in rightList:
            df[str(i+'_r')]=0
    
    for i in rightList:
        if i not in leftList:
            df[str(i+'_l')]=0

    for i in list(df):
        try:
            if i != columnName and str(i)[-2:]!='_r':
                if str(i)[:-2]+'_r' in list(df):
                    df[str(i)[:-2]]=df[str(i)[:-2]+'_r']-df[i]
                else:
                    print("not there",str(i)[:-2]+'_r', list(df))
            elif '_r' in str(i):
                pass
                #df=df.drop([i], axis=1)
        except Exception as e:
            #print("Error:", str(e))
            continue
    
    for i in list(df):
        if str(i)[-2:]=='_l' or str(i)[-2:]=='_r':
            df=df.drop([i], axis=1)
        
    return df
    
    
    
            
In [6]:
df1 = pd.DataFrame(columns=["id","A", "B"], data=[[1,5,3], [2,3,5]])
df2 = pd.DataFrame(columns=["id","A", "B"], data=[[1,2,3], [2,3,6]])
In [7]:
df1=cleanDF(df1, 'id')
df2=cleanDF(df2, 'id')
result=compareDF(df1,df2,'id')
In [8]:
result.head()
Out[8]:
id A_l B_l A_r B_r
0 1 5.0 3.0 2.0 3.0
1 2 3.0 5.0 3.0 6.0
In [10]:
# Right - Left
result2=getDiff(result, 'id')
not there _r ['id', 'A_l', 'B_l', 'A_r', 'B_r', 'A', 'B']
not there _r ['id', 'A_l', 'B_l', 'A_r', 'B_r', 'A', 'B']
In [11]:
result2.head()
Out[11]:
id A B
0 1 -3.0 0.0
1 2 0.0 1.0
In [18]:
df1 = pd.DataFrame(columns=["id","A", "B"], data=[[1,5,3], [2,3,5]])
df2 = pd.DataFrame(columns=["id","A", "B", "C"], data=[[1,2,3,4], [2,3,6,2], [12,13,16,1]])
In [19]:
df1=cleanDF(df1, 'id')
df2=cleanDF(df2, 'id')
result=compareDF(df2,df1,'id')
In [20]:
result.head()
Out[20]:
id A_l B_l C_l A_r B_r
0 1 2.0 3.0 4.0 5.0 3.0
1 2 3.0 6.0 2.0 3.0 5.0
2 12 13.0 16.0 1.0 0.0 0.0
In [21]:
result2=getDiff(result, 'id')
In [22]:
result2.head()
Out[22]:
id A B C
0 1 3.0 0.0 -4.0
1 2 0.0 -1.0 -2.0
2 12 -13.0 -16.0 -1.0
In [ ]:
 
In [ ]: