Skip to content

Commit c35c3c2

Browse files
authored
fix: update solutions to lc problem: No.3089 (doocs#2511)
No.3089.Find Bursty Behavior
1 parent 0e0add0 commit c35c3c2

File tree

4 files changed

+97
-58
lines changed

4 files changed

+97
-58
lines changed

solution/3000-3099/3089.Find Bursty Behavior/README.md

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ Each row of this table contains post_id, user_id, and post_date.
7979

8080
### 方法一:自连接 + 分组统计
8181

82-
我们可以使用自连接,将表 `Posts` 与自身连接,连接条件是 `p1.user_id = p2.user_id``p2.post_date``p1.post_date``p1.post_date``6` 天之间,然后我们将连接结果按照 `p1.user_id``p1.post_date` 分组,即可统计出每个用户在每天的 7 天内的发帖数量,我们将这个结果保存在表 `P` 中。
82+
我们可以使用自连接,将表 `Posts` 与自身连接,连接条件是 `p1.user_id = p2.user_id`,且 `p2.post_date` 在 `p1.post_date` 与 `p1.post_date` 加 `6` 天之间,然后我们将连接结果按照 `p1.user_id` 和 `p1.post_id` 分组,即可统计出每个用户以每个帖子为起点的 7 天窗口内的发帖数量,我们将这个结果保存在表 `P` 中。
8383

8484
接着我们统计出每个用户在 2024 年 2 月份的每周平均发帖数量,保存在表 `T` 中。注意,我们需要查找 `post_date` 在 `2024-02-01` 与 `2024-02-28` 之间的记录,将记录按照 `user_id` 分组,然后统计每个用户的发帖数量,最后除以 `4` 即可得到每周平均发帖数量,我们将这个结果保存在表 `T` 中。
8585

@@ -97,7 +97,7 @@ WITH
9797
JOIN Posts AS p2
9898
ON p1.user_id = p2.user_id
9999
AND p2.post_date BETWEEN p1.post_date AND DATE_ADD(p1.post_date, INTERVAL 6 DAY)
100-
GROUP BY p1.user_id, p1.post_date
100+
GROUP BY p1.user_id, p1.post_id
101101
),
102102
T AS (
103103
SELECT user_id, COUNT(1) / 4 AS avg_weekly_posts
@@ -117,26 +117,49 @@ ORDER BY 1;
117117
```python
118118
import pandas as pd
119119

120+
120121
def find_bursty_behavior(posts: pd.DataFrame) -> pd.DataFrame:
121-
# 计算每个用户在7天窗口内发布的帖子数
122-
p = posts.merge(posts, on='user_id')
123-
p = p[(p['post_date_y'] >= p['post_date_x']) &
124-
(p['post_date_y'] <= p['post_date_x'] + pd.Timedelta(days=6))]
125-
p_count = p.groupby(['user_id', 'post_date_x']).size().reset_index(name='cnt')
126-
127-
# 计算每个用户在2024年2月期间的平均每周发布的帖子数
128-
t = posts[(posts['post_date'] >= '2024-02-01') &
129-
(posts['post_date'] <= '2024-02-28')]
130-
t_count = t.groupby('user_id').size().reset_index(name='count')
131-
t_count['avg_weekly_posts'] = t_count['count'] / 4
132-
133-
# 合并两个计算出的表,并过滤符合条件的用户
134-
merged_df = p_count.merge(t_count, on='user_id')
135-
merged_df = merged_df.groupby('user_id').agg(max_7day_posts=('cnt', 'max'),
136-
avg_weekly_posts=('avg_weekly_posts', 'first'))
137-
result_df = merged_df[merged_df['max_7day_posts'] >= merged_df['avg_weekly_posts'] * 2].reset_index()
138-
139-
return result_df.sort_values('user_id')
122+
# 子查询 P
123+
p1 = pd.merge(
124+
posts, posts, on="user_id", suffixes=("_1", "_2")
125+
) # 合并帖子表自身,根据用户ID
126+
p1 = p1[
127+
p1["post_date_2"].between(
128+
p1["post_date_1"], p1["post_date_1"] + pd.Timedelta(days=6)
129+
)
130+
] # 筛选出相邻 7 天内的帖子
131+
p1 = (
132+
p1.groupby(["user_id", "post_id_1"]).size().reset_index(name="cnt")
133+
) # 统计每个用户在相邻 7 天内的帖子数
134+
135+
# 子查询 T
136+
t = posts[
137+
(posts["post_date"] >= "2024-02-01") & (posts["post_date"] <= "2024-02-28")
138+
] # 筛选出 2024 年 2 月份的帖子
139+
t = (
140+
t.groupby("user_id").size().div(4).reset_index(name="avg_weekly_posts")
141+
) # 计算每个用户平均每周的帖子数
142+
143+
# 连接 P 和 T
144+
merged_df = pd.merge(p1, t, on="user_id", how="inner") # 内连接 P 和 T
145+
146+
# 过滤
147+
filtered_df = merged_df[
148+
merged_df["cnt"] >= merged_df["avg_weekly_posts"] * 2
149+
] # 过滤出满足条件的行
150+
151+
# 聚合
152+
result_df = (
153+
filtered_df.groupby("user_id")
154+
.agg({"cnt": "max", "avg_weekly_posts": "first"})
155+
.reset_index()
156+
) # 对满足条件的行按用户ID聚合
157+
result_df.columns = ["user_id", "max_7day_posts", "avg_weekly_posts"] # 重命名列名
158+
159+
# 排序
160+
result_df.sort_values(by="user_id", inplace=True) # 按用户ID排序
161+
162+
return result_df
140163
```
141164

142165
<!-- tabs:end -->

solution/3000-3099/3089.Find Bursty Behavior/README_EN.md

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ Each row of this table contains post_id, user_id, and post_date.
7777

7878
### Solution 1: Self-Join + Group Count
7979

80-
We can use self-join to connect the `Posts` table with itself. The connection condition is `p1.user_id = p2.user_id` and `p2.post_date` is between `p1.post_date` and 6 days after `p1.post_date`. Then we group the connection results by `p1.user_id` and `p1.post_date` to count the number of posts for each user within 7 days of each day. We save this result in table `P`.
80+
We can use a self-join to connect the `Posts` table with itself. The join condition is `p1.user_id = p2.user_id` and `p2.post_date` between `p1.post_date` and 6 days after `p1.post_date`. Then we group the joined rows by `p1.user_id` and `p1.post_id` to count, for each post, the number of the user's posts within the 7-day window starting at that post. We save this result in table `P`.
8181

8282
Next, we count the average number of posts per week for each user in February 2024 and save it in table `T`. Note that we need to find records where `post_date` is between `2024-02-01` and `2024-02-28`, group the records by `user_id`, then count the number of posts for each user, and finally divide by `4` to get the average number of posts per week. We save this result in table `T`.
8383

@@ -95,7 +95,7 @@ WITH
9595
JOIN Posts AS p2
9696
ON p1.user_id = p2.user_id
9797
AND p2.post_date BETWEEN p1.post_date AND DATE_ADD(p1.post_date, INTERVAL 6 DAY)
98-
GROUP BY p1.user_id, p1.post_date
98+
GROUP BY p1.user_id, p1.post_id
9999
),
100100
T AS (
101101
SELECT user_id, COUNT(1) / 4 AS avg_weekly_posts
@@ -117,31 +117,39 @@ import pandas as pd
117117

118118

119119
def find_bursty_behavior(posts: pd.DataFrame) -> pd.DataFrame:
120-
# Calculate the count of posts made by each user within a 7-day window
121-
p = posts.merge(posts, on="user_id")
122-
p = p[
123-
(p["post_date_y"] >= p["post_date_x"])
124-
& (p["post_date_y"] <= p["post_date_x"] + pd.Timedelta(days=6))
120+
# Subquery P
121+
p1 = pd.merge(posts, posts, on="user_id", suffixes=("_1", "_2"))
122+
p1 = p1[
123+
p1["post_date_2"].between(
124+
p1["post_date_1"], p1["post_date_1"] + pd.Timedelta(days=6)
125+
)
125126
]
126-
p_count = p.groupby(["user_id", "post_date_x"]).size().reset_index(name="cnt")
127+
p1 = p1.groupby(["user_id", "post_id_1"]).size().reset_index(name="cnt")
127128

128-
# Calculate the average weekly posts for each user in February 2024
129+
# Subquery T
129130
t = posts[
130131
(posts["post_date"] >= "2024-02-01") & (posts["post_date"] <= "2024-02-28")
131132
]
132-
t_count = t.groupby("user_id").size().reset_index(name="count")
133-
t_count["avg_weekly_posts"] = t_count["count"] / 4
133+
t = t.groupby("user_id").size().div(4).reset_index(name="avg_weekly_posts")
134134

135-
# Joining the two calculated tables and filtering users meeting the criteria
136-
merged_df = p_count.merge(t_count, on="user_id")
137-
merged_df = merged_df.groupby("user_id").agg(
138-
max_7day_posts=("cnt", "max"), avg_weekly_posts=("avg_weekly_posts", "first")
135+
# Joining P and T
136+
merged_df = pd.merge(p1, t, on="user_id", how="inner")
137+
138+
# Filtering
139+
filtered_df = merged_df[merged_df["cnt"] >= merged_df["avg_weekly_posts"] * 2]
140+
141+
# Aggregating
142+
result_df = (
143+
filtered_df.groupby("user_id")
144+
.agg({"cnt": "max", "avg_weekly_posts": "first"})
145+
.reset_index()
139146
)
140-
result_df = merged_df[
141-
merged_df["max_7day_posts"] >= merged_df["avg_weekly_posts"] * 2
142-
].reset_index()
147+
result_df.columns = ["user_id", "max_7day_posts", "avg_weekly_posts"]
148+
149+
# Sorting
150+
result_df.sort_values(by="user_id", inplace=True)
143151

144-
return result_df.sort_values("user_id")
152+
return result_df
145153
```
146154

147155
<!-- tabs:end -->

solution/3000-3099/3089.Find Bursty Behavior/Solution.py

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,28 +2,36 @@
22

33

44
def find_bursty_behavior(posts: pd.DataFrame) -> pd.DataFrame:
    """Return users whose peak 7-day post count is at least twice their
    average weekly post count in February 2024.

    Mirrors the SQL solution: a self-join counts, for every post, how many
    of the same user's posts fall in the 7-day window starting at that
    post's date; a second aggregate computes the February weekly average.

    Args:
        posts: DataFrame with columns ``post_id``, ``user_id`` and
            ``post_date`` (``post_date`` is assumed datetime-like —
            confirm against the caller).

    Returns:
        DataFrame with columns ``user_id``, ``max_7day_posts`` and
        ``avg_weekly_posts``, sorted by ``user_id``.
    """
    # Self-join on user_id; suffixes distinguish the two copies of each row.
    pairs = posts.merge(posts, on="user_id", suffixes=("_1", "_2"))

    # Keep only pairs where the second post lies in the inclusive 7-day
    # window [post_date_1, post_date_1 + 6 days] opened by the first post.
    window_start = pairs["post_date_1"]
    window_end = window_start + pd.Timedelta(days=6)
    in_window = pairs["post_date_2"].between(window_start, window_end)

    # One row per (user, anchor post): the window's post count ("subquery P").
    window_counts = (
        pairs[in_window]
        .groupby(["user_id", "post_id_1"])
        .size()
        .reset_index(name="cnt")
    )

    # Average weekly posts per user over February 2024 — 4 weeks ("subquery T").
    feb_mask = (posts["post_date"] >= "2024-02-01") & (
        posts["post_date"] <= "2024-02-28"
    )
    weekly_avg = (
        posts[feb_mask]
        .groupby("user_id")
        .size()
        .div(4)
        .reset_index(name="avg_weekly_posts")
    )

    # Join the two aggregates (inner, the merge default), keep only the
    # "bursty" windows, then take each qualifying user's peak window count.
    joined = window_counts.merge(weekly_avg, on="user_id", how="inner")
    bursty = joined[joined["cnt"] >= joined["avg_weekly_posts"] * 2]
    result = (
        bursty.groupby("user_id")
        .agg({"cnt": "max", "avg_weekly_posts": "first"})
        .reset_index()
    )
    result.columns = ["user_id", "max_7day_posts", "avg_weekly_posts"]
    return result.sort_values(by="user_id")

solution/3000-3099/3089.Find Bursty Behavior/Solution.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ WITH
77
JOIN Posts AS p2
88
ON p1.user_id = p2.user_id
99
AND p2.post_date BETWEEN p1.post_date AND DATE_ADD(p1.post_date, INTERVAL 6 DAY)
10-
GROUP BY p1.user_id, p1.post_date
10+
GROUP BY p1.user_id, p1.post_id
1111
),
1212
T AS (
1313
SELECT user_id, COUNT(1) / 4 AS avg_weekly_posts

0 commit comments

Comments
 (0)